### **1/ Import Packages**

In [9]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

### **2/ Import Data (train, test, sample_submission)**

In [10]:
# defining csv file paths
train_file_path = '/content/drive/MyDrive/Colab Notebooks/DSM ML Classfication/train_jqd04QH.csv'
test_file_path = '/content/drive/MyDrive/Colab Notebooks/DSM ML Classfication/test_KaymcHn.csv'
submission_file_path = '/content/drive/MyDrive/Colab Notebooks/DSM ML Classfication/sample_submission_sxfcbdx.csv'

# reading csv files
train = pd.read_csv(train_file_path)
test = pd.read_csv(test_file_path)
submission = pd.read_csv(submission_file_path)

## **3/ Check data**

Checking data is a crucial step in the data analysis process, especially when working with datasets in data science. It helps you understand the structure and content of your data, identify potential issues, and make informed decisions about preprocessing, cleaning, and analysis. Here's why and how to check data, presented in bullet points, along with code examples:

### **Why Check Data:**

**Data Quality:** Ensures data is accurate, consistent, and reliable.

**Data Exploration**: Helps you understand the distribution of features and target variables.

**Missing Values:** Identifies missing or null values that might need to be handled.

**Outliers:** Detects unusual or erroneous data points that could impact analysis.

**Data Types:** Verifies if columns have appropriate data types.
Feature Engineering: Offers insights for creating new features or transformations.

In [11]:
print('first few rows of train dataset')
train.head(10)

first few rows of train dataset


Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,23798,city_149,0.689,Male,Has relevent experience,no_enrollment,Graduate,STEM,3,100-500,Pvt Ltd,1,106,0
1,29166,city_83,0.923,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,<10,Funded Startup,1,69,0
2,46,city_16,0.91,,Has relevent experience,no_enrollment,Graduate,STEM,6,50-99,Public Sector,2,4,0
3,18527,city_64,0.666,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,50-99,Pvt Ltd,1,26,0
4,21751,city_100,0.887,,No relevent experience,no_enrollment,Masters,STEM,8,,,2,88,1
5,13342,city_21,0.624,Female,Has relevent experience,no_enrollment,Graduate,Other,8,5000-9999,Pvt Ltd,2,34,0
6,11746,city_21,0.624,Male,Has relevent experience,no_enrollment,Graduate,STEM,6,10000+,Pvt Ltd,3,23,1
7,24127,city_114,0.926,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,<10,Pvt Ltd,>4,8,0
8,7615,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,1000-4999,Pvt Ltd,>4,10,0
9,9676,city_97,0.925,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,1000-4999,Pvt Ltd,>4,85,0


In [None]:
print('\nDataset information and datatypes:')
print('\n')
train.info()

print('\nStatestics of train dataset:')
print('\n')
train.describe()

In [None]:
print('\nChecking null values of train dataset')
print('\n')
train.isnull().sum()

**4/ Check Target feature**

In [12]:
# checking the target column for class imbalance
class_counts = train['target'].value_counts()
class_ratio = class_counts/ len(train)

# display class count and ratio
print('class imbalance in target column and ratio is: ')
print(class_counts)

# display class ratio
print('\nclass imbalance in target column class ratio is: ')
print(class_ratio)

class imbalance in target column and ratio is: 
0    15934
1     2425
Name: target, dtype: int64

class imbalance in target column class ratio is: 
0    0.867912
1    0.132088
Name: target, dtype: float64


### **5/ Partition Data into y & X (train, test)**

*   Write code to partition 'train', 'test' data
*   train into y,X
*   test into X_test
*   Use X, y to split into - X_train, X_Val, y_train, y_val
*   Exclude target and enrollee_id from X_data





In [13]:
# Define features (X) and target (y)
X = train.drop(['target', 'enrollee_id'], axis=1)
y = train['target']

# partition "test" data into X_test (features)
X_test = test.drop('enrollee_id', axis=1)

**6/ Validation Strategy - train test split**

In [14]:
# Split data into training (80%) and test (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Split X and y into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
# checking the shape of all test and train
print("Shape of Split datasets are as below: ")
print('\n')
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)

Shape of Split datasets are as below: 


X_train: (14687, 12)
X_test: (3672, 12)
y_train: (14687,)
y_test: (3672,)


**7/ Extract numerical and categorical feature names**

In [15]:
# extracting numerical feature names from X_train
numerical_columns = X_train.select_dtypes(include=['Int64', 'float64']).columns.tolist()

# extracting categorical feature names from X_train
categorical_columns = X_train.select_dtypes(include=['object']).columns.tolist()

**8/ Pre processing framework (Basic)**

In [21]:
# Missing Value Treatment for Numerical features using median

# 1 - Import Package
from sklearn.impute import SimpleImputer

# 2 - Create an Instance
imputer = SimpleImputer(strategy='median')

# 3 - Fit Instance – X_train
imputer.fit(X_train[numerical_columns])

# 4 - Transform – X_train, X_val, X_test
X_train[numerical_columns] = imputer.transform(X_train[numerical_columns])
X_val[numerical_columns] = imputer.transform(X_val[numerical_columns])
X_test[numerical_columns] = imputer.transform(X_test[numerical_columns])

In [22]:
# Missing Value Treatment for categorical features using median

# 1 - Import Package
from sklearn.impute import SimpleImputer

# 2 - Create an Instance
imputer = SimpleImputer(strategy='constant', fill_value='YourConstantValue')

# 3 - Fit Instance – X_train
imputer.fit(X_train[categorical_columns])

# 4 - Transform – X_train, X_val, X_test
X_train[categorical_columns] = imputer.transform(X_train[categorical_columns])
X_val[categorical_columns] = imputer.transform(X_val[categorical_columns])
X_test[categorical_columns] = imputer.transform(X_test[categorical_columns])

In [24]:
# # Missing Value Treatment for standard Scaling using median

# 1 - Import Packages
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# 2 - Create an Instance of SimpleImputer
imputer = SimpleImputer(strategy='mean')

# 3 - Fit Instance – X_train
imputer.fit(X_train[numerical_columns])

# 4 - Transform – X_train, X_val, X_test
X_train[numerical_columns] = imputer.transform(X_train[numerical_columns])
X_val[numerical_columns] = imputer.transform(X_val[numerical_columns])
X_test[numerical_columns] = imputer.transform(X_test[numerical_columns])

# 5 - Create an Instance of StandardScaler
scaler = StandardScaler()

# 6 - Fit and Transform Using StandardScaler
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_val[numerical_columns] = scaler.transform(X_val[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

In [26]:
# missing value treatment for categorical encoding using Ordinal Encoding

# 1 - Import Package
from sklearn.preprocessing import OrdinalEncoder

# 2 - Create an Instance of OrdinalEncoder
encoder = OrdinalEncoder()

# 3 - Fit and Transform – X_train
X_train[categorical_columns] = encoder.fit_transform(X_train[categorical_columns])

# 4 - Transform – X_val, X_test
X_val[categorical_columns] = encoder.transform(X_val[categorical_columns])
X_test[categorical_columns] = encoder.transform(X_test[categorical_columns])

**9/ Build model, Make predictions, Check accuracy**

In [28]:
# Create an Instance of LogisticRegression
model = LogisticRegression(max_iter=1000, solver='lbfgs')

# Fit the Model on X_train and y_train
model.fit(X_train, y_train)

In [31]:
# making the prediction

y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

In [32]:
# Checking Accuracy
train_classification_report = classification_report(y_train, y_train_pred)
val_classification_report = classification_report(y_val, y_val_pred)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
# Print classification reports
print("Classification Report for X_train:")
print(train_classification_report)

Classification Report for X_train:
              precision    recall  f1-score   support

           0       0.87      1.00      0.93     12762
           1       0.00      0.00      0.00      1925

    accuracy                           0.87     14687
   macro avg       0.43      0.50      0.46     14687
weighted avg       0.76      0.87      0.81     14687



In [34]:
print("\nClassification Report for X_val:")
print(val_classification_report)


Classification Report for X_val:
              precision    recall  f1-score   support

           0       0.86      1.00      0.93      3172
           1       1.00      0.00      0.00       500

    accuracy                           0.86      3672
   macro avg       0.93      0.50      0.47      3672
weighted avg       0.88      0.86      0.80      3672



In [41]:
# Create the DataFrame for predictions

submission_2 = pd.DataFrame({
    "enrollee_id": test["enrollee_id"],  # Assuming 'enrollee_id' is a unique identifier in the 'test' DataFrame
    "target": y_test_pred                # Replace 'y_test_pred' with the variable that contains your test set predictions
})

ValueError: ignored

**10/ Make and export submission file**

In [45]:
# Define the output file path
output_path = "/content/drive/MyDrive/Colab Notebooks/DSM ML Classfication/submission.csv"

# Export the submission DataFrame to CSV
submission.to_csv(output_path, index=False)

print("Submission DataFrame exported to:", output_path)

Submission DataFrame exported to: /content/drive/MyDrive/Colab Notebooks/DSM ML Classfication/submission.csv
