#1. Loading required modules

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer, MissingIndicator
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import SMOTE
from sklearn import preprocessing
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn import preprocessing
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
df = pd.read_csv('gender_classification_v7.csv')
features = df.loc[:, df.columns != 'gender']
label = df['gender']
print(features)
print(df.isna().sum())

#3. Check for missing values:

In [None]:
print(df.isna().sum())

In [None]:
def add_missing_values(X_full):
    import numpy as np
    Col_names=X_full.columns
    X_full=X_full.to_numpy()
    rng = np.random.RandomState(4)
    n_samples, n_features = X_full.shape

    # Add missing values in 75% of the lines
    missing_rate = 0.75
    n_missing_samples = int(n_samples * missing_rate)

    missing_samples = np.zeros(n_samples, dtype=bool)
    missing_samples[:n_missing_samples] = True

    rng.shuffle(missing_samples)
    missing_features = rng.randint(0, n_features, n_missing_samples)
    X_missing = X_full.copy()
    X_missing[missing_samples, missing_features] = np.nan
    X_missing=pd.DataFrame(X_missing)
    X_missing.columns=Col_names
    return X_missing



In [None]:
features_df_with_missing_values=add_missing_values(features)
print(features_df_with_missing_values.isna().sum())
print(features_df_with_missing_values.head(60))


Imputing the missing values using three different methods and assign the imputed output datasets into variables:

In [None]:
#First way, imputing by row mean
df_features_row_mean= features_df_with_missing_values.copy().T
imputer_mean = SimpleImputer(missing_values = np.nan, strategy = "mean")
imputer_mean = imputer_mean.fit(df_features_row_mean)
df_features_row_mean = imputer_mean.transform(df_features_row_mean).T
df_features_row_mean = pd.DataFrame(df_features_row_mean, columns=features_df_with_missing_values.columns)
print(df_features_row_mean.isna().sum())

#Second way, imputing by column most frequent
df_features_col_most_frequent=features_df_with_missing_values.copy()
imputer_col_most_frequent = SimpleImputer(missing_values = np.nan, strategy = "most_frequent")
imputer_col_most_frequent = imputer_col_most_frequent.fit(df_features_col_most_frequent)
df_features_col_most_frequent = imputer_col_most_frequent.transform(df_features_col_most_frequent)
df_features_col_most_frequent = pd.DataFrame(df_features_col_most_frequent, columns=features_df_with_missing_values.columns)
print(df_features_col_most_frequent.isna().sum())


#Third way, imputing using KNN
df_features_KNN = features_df_with_missing_values.copy()
imputer_KNN = KNNImputer(n_neighbors = 3)
imputer_KNN = imputer_KNN.fit(df_features_KNN)
df_features_KNN = imputer_KNN.transform(df_features_KNN)
df_features_KNN = pd.DataFrame(df_features_KNN, columns=features_df_with_missing_values.columns)
print(df_features_KNN.isna().sum())

#5. Preprocessing - performing Normalization for each one of the 3 methods


In [None]:
#Normalizing for first way(row mean)
transformer_row_mean = preprocessing.Normalizer()
df_row_mean_normalized = transformer_row_mean.transform(df_features_row_mean)
df_row_mean_normalized = pd.DataFrame(df_row_mean_normalized, columns=features_df_with_missing_values.columns)
print(df_row_mean_normalized.head(10))

#Normalizing for second way(column most frequent)
transformer_column_most_frequent = preprocessing.Normalizer()
df_column_most_frequent_normalized = transformer_column_most_frequent.transform(df_features_col_most_frequent)
df_column_most_frequent_normalized = pd.DataFrame(df_column_most_frequent_normalized, columns=features_df_with_missing_values.columns)
print(df_column_most_frequent_normalized.head(10))

#Normalizing for third way(KNN imputation)
transformer_KNN = preprocessing.Normalizer()
df_KNN_normalized = transformer_KNN.transform(df_features_KNN)
df_KNN_normalized = pd.DataFrame(df_KNN_normalized, columns=features_df_with_missing_values.columns)
print(df_KNN_normalized.head(10))

"""Q5 - B"""

sns.countplot(x= label)
# i used chatgpt here to find the value counts command in seaborn
print(label.value_counts())


Performing data balancing

In [25]:
copy_features = features.copy()
copy_label = label.copy()
nm_version = 2
near_miss = NearMiss(version=nm_version)
Features_Resampled_NM, Label_Resampled_NM = near_miss.fit_resample(copy_features, copy_label)
print(Label_Resampled_NM.value_counts())

smote = SMOTE()
Features_Resampled_SMOTE, Label_Resampled_SMOTE = smote.fit_resample(features, label)
print(Label_Resampled_SMOTE.value_counts())

Female    2500
Male      2500
Name: gender, dtype: int64
Male      2501
Female    2501
Name: gender, dtype: int64


"\n#all of this is from chat gpt\nSMOTE (Synthetic Minority Over-sampling Technique):\nSMOTE addresses class imbalance by creating synthetic samples for the minority class. It selects an instance, finds its nearest neighbors from the same class, and generates new instances along the line connecting them. This expands the minority class, improving its representation. Advantages: Effective in increasing minority class samples, mitigating class imbalance. Disadvantages: Can introduce noise, leading to overfitting. Doesn't consider feature distribution, which might impact quality of synthetic samples."

#7. Train test split:

In [26]:
#first df
X_train_row_mean, X_test_row_mean, y_train_row_mean, y_test_row_mean = train_test_split(df_row_mean_normalized, label, test_size = 0.2, random_state=5)

#Second df
X_train_col_most_frequent, X_test_col_most_frequent, y_train_col_most_frequent, y_test_col_most_frequent = train_test_split(df_column_most_frequent_normalized, label, test_size = 0.2, random_state=5)

#Third df
X_train_KNN, X_test_KNN, y_train_KNN, y_test_KNN = train_test_split(df_KNN_normalized, label, test_size = 0.2, random_state=5)


#8. Training models (for both imputed datasets):



In [27]:
"""Q8 - A"""
# The three chosen models are: Logistic Regression, Catboost, Naive Bayes Classifier on train set

'''Logistic Regression'''
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=42)
model1 = LogisticRegression()
model1.fit(X_train, y_train)
y_pred = model1.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

'''XGBoost'''
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=42)
model2 = xgb.XGBClassifier()
label_encoder = preprocessing.LabelEncoder()
encoded_labels_train = label_encoder.fit_transform(y_train)
encoded_labels_test = label_encoder.transform(y_test)
model2.fit(X_train, encoded_labels_train)
y_pred = model2.predict(X_test)
accuracy = accuracy_score(encoded_labels_test, y_pred)
print("Accuracy:", accuracy)

'''Naive Bayes'''
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=42)
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))

'''Q8 - B'''
# Used ChatGPT for hyperparameters
# Logistic Regression:

lr_param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']
}
# First df
lr_classifier = LogisticRegression(random_state=42)
lr_grid_search = GridSearchCV(lr_classifier, lr_param_grid, cv=5, n_jobs=-1, scoring='accuracy')
lr_grid_search.fit(X_train_row_mean, y_train_row_mean)
best_lr_model = lr_grid_search.best_estimator_
y_pred_lr1 = best_lr_model.predict(X_test_row_mean)
accuracy_lr1 = accuracy_score(y_test_row_mean, y_pred_lr1)
print("Logistic Regression Accuracy for the first DF:", accuracy_lr1)

# Second df
lr_grid_search.fit(X_train_col_most_frequent, y_train_col_most_frequent)
best_lr_model = lr_grid_search.best_estimator_
y_pred_lr2 = best_lr_model.predict(X_test_col_most_frequent)
accuracy_lr2 = accuracy_score(y_test_col_most_frequent, y_pred_lr2)
print("Logistic Regression Accuracy for the Second DF:", accuracy_lr2)

# Third df
lr_grid_search.fit(X_train_KNN, y_train_KNN)
best_lr_model = lr_grid_search.best_estimator_
y_pred_lr3 = best_lr_model.predict(X_test_KNN)
accuracy_lr3 = accuracy_score(y_test_KNN, y_pred_lr3)
print("Logistic Regression Accuracy for the Third DF:", accuracy_lr3)

# Gaussian Naive Bayes:
nb_param_grid = {
    'var_smoothing': [1e-9, 1e-8, 1e-7]
}
# First df
nb_classifier = GaussianNB()
nb_grid_search = GridSearchCV(nb_classifier, nb_param_grid, cv=5, n_jobs=-1, scoring='accuracy')
nb_grid_search.fit(X_train_row_mean, y_train_row_mean)
best_nb_model = nb_grid_search.best_estimator_
y_pred_nb1 = best_nb_model.predict(X_test_row_mean)
accuracy_nb1 = accuracy_score(y_test_row_mean, y_pred_nb1)
print("Naive Bayes Accuracy for first df:", accuracy_nb1)

# Second df
nb_grid_search.fit(X_train_col_most_frequent, y_train_col_most_frequent)
best_nb_model = nb_grid_search.best_estimator_
y_pred_nb2 = best_nb_model.predict(X_test_col_most_frequent)
accuracy_nb2 = accuracy_score(y_test_col_most_frequent, y_pred_nb2)
print("Naive Bayes Accuracy for the second df:", accuracy_nb2)

# Third df
nb_grid_search.fit(X_train_KNN, y_train_KNN)
best_nb_model = nb_grid_search.best_estimator_
y_pred_nb3 = best_nb_model.predict(X_test_KNN)
accuracy_nb3 = accuracy_score(y_test_KNN, y_pred_nb3)
print("Naive Bayes Accuracy for the Third df:", accuracy_nb3)


# XGBoost
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 300]
}
grid_search = GridSearchCV(model2, param_grid, cv=5, n_jobs=-1, scoring='accuracy')
grid_search.fit(X_train, encoded_labels_train)
best_xgb_model = grid_search.best_estimator_
y_pred = best_xgb_model.predict(X_test)
accuracy = accuracy_score(encoded_labels_test, y_pred)
print("Accuracy:", accuracy)

''' Q8 - C'''
best_lr_hyperparameters = lr_grid_search.best_params_
print("Best Hyperparameters for Logistic Regression:", best_lr_hyperparameters)

best_xgb_model = grid_search.best_estimator_
y_pred = best_xgb_model.predict(X_test)
accuracy = accuracy_score(encoded_labels_test, y_pred)
print("Best Hyperparameters for XGBoost:", grid_search.best_params_)

best_nb_model = nb_grid_search.best_estimator_
best_nb_hyperparameters = best_nb_model.get_params()
print("Best Hyperparameters for Multinomial Naive Bayes:", best_nb_hyperparameters)

Accuracy: 0.9600399600399601
Accuracy: 0.961038961038961
Number of mislabeled points out of a total 1001 points : 36
Logistic Regression Accuracy for the first DF: 0.8151848151848152
Logistic Regression Accuracy for the Second DF: 0.9300699300699301
Logistic Regression Accuracy for the Third DF: 0.949050949050949
Naive Bayes Accuracy for first df: 0.8551448551448552
Naive Bayes Accuracy for the second df: 0.936063936063936
Naive Bayes Accuracy for the Third df: 0.948051948051948
Accuracy: 0.968031968031968
Best Hyperparameters for Logistic Regression: {'C': 10, 'penalty': 'l1', 'solver': 'saga'}
Best Hyperparameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
Best Hyperparameters for Multinomial Naive Bayes: {'priors': None, 'var_smoothing': 1e-09}


Splitting to train and test and performing training and prediction for each model

In [28]:

imputed_datasets = [('df_row_mean', df_row_mean_normalized), ('df_col_most_frequent', df_column_most_frequent_normalized), ('df_KNN', df_KNN_normalized)]

for dataset_name, dataset in imputed_datasets:
  X_train, X_test, y_train, y_test = train_test_split(dataset, label, test_size=0.2, random_state=42)

  log_model = LogisticRegression()

  LR = log_model.fit(X_train, y_train)

  LR_train_preds = LR.predict(X_train)
  print(f"train prediction logistic regression {dataset_name}\n" + classification_report(LR_train_preds, y_train))

  LR_test_preds = LR.predict(X_test)
  print(f"test prediction logistic regression {dataset_name}\n" + classification_report(LR_test_preds, y_test))


  label_encoder = preprocessing.LabelEncoder()
  label_encoded = label_encoder.fit_transform(label)
  X_train, X_test, y_train, y_test = train_test_split(dataset, label_encoded, test_size=0.2, random_state=42)

  XGBmodel = XGBClassifier()

  XGB = XGBmodel.fit(X_train, y_train)

  XGB_train_preds = XGB.predict(X_train)
  print(f"train prediction XGB {dataset_name} \n" + classification_report(XGB_train_preds, y_train))

  XGB_test_preds = XGB.predict(X_test)
  print(f"test prediction XGB {dataset_name}\n" + classification_report(XGB_test_preds, y_test))


  X_train, X_test, y_train, y_test = train_test_split(dataset, label_encoded, test_size=0.2, random_state=42)

  GNBmodel = GaussianNB()

  GNB = GNBmodel.fit(X_train, y_train)

  GNB_train_preds = GNB.predict(X_train)
  print(f"train prediction GNB {dataset_name} \n" + classification_report(GNB_train_preds, y_train))

  GNB_test_preds = GNB.predict(X_test)
  print(f"test prediction GNB {dataset_name} \n" + classification_report(GNB_test_preds, y_test))


train prediction logistic regression df_row_mean
              precision    recall  f1-score   support

      Female       0.81      0.82      0.82      1974
        Male       0.83      0.81      0.82      2026

    accuracy                           0.82      4000
   macro avg       0.82      0.82      0.82      4000
weighted avg       0.82      0.82      0.82      4000

test prediction logistic regression df_row_mean
              precision    recall  f1-score   support

      Female       0.80      0.80      0.80       501
        Male       0.80      0.80      0.80       500

    accuracy                           0.80      1001
   macro avg       0.80      0.80      0.80      1001
weighted avg       0.80      0.80      0.80      1001

train prediction XGB df_row_mean 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2000
           1       1.00      1.00      1.00      2000

    accuracy                           1.00      40

#b. Verbally explain which model resulted with the best outcome with consideration to over-fitting, under-fitting and proper-fitting.

#Printing classification_report of the test set using the best model 

In [29]:
imputed_datasets = [('df_row_mean', df_row_mean_normalized), ('df_col_most_frequent', df_column_most_frequent_normalized), ('df_KNN', df_KNN_normalized)]

for dataset_name, dataset in imputed_datasets:
  X_train, X_test, y_train, y_test = train_test_split(dataset, label, test_size=0.2, random_state=42)

  log_model = LogisticRegression()

  LR = log_model.fit(X_train, y_train)

  LR_test_preds = LR.predict(X_test)
  print(f"test prediction logistic regression {dataset_name}\n" + classification_report(LR_test_preds, y_test))

test prediction logistic regression df_row_mean
              precision    recall  f1-score   support

      Female       0.80      0.80      0.80       501
        Male       0.80      0.80      0.80       500

    accuracy                           0.80      1001
   macro avg       0.80      0.80      0.80      1001
weighted avg       0.80      0.80      0.80      1001

test prediction logistic regression df_col_most_frequent
              precision    recall  f1-score   support

      Female       0.91      0.95      0.93       480
        Male       0.96      0.92      0.94       521

    accuracy                           0.93      1001
   macro avg       0.93      0.93      0.93      1001
weighted avg       0.94      0.93      0.93      1001

test prediction logistic regression df_KNN
              precision    recall  f1-score   support

      Female       0.94      0.95      0.95       495
        Male       0.95      0.94      0.95       506

    accuracy                      