### Libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix, cohen_kappa_score

### Load the dataset

In [None]:
Gender = pd.read_csv("Gender.csv")

In [None]:
Gender.groupby('user_gender')['user_gender'].count()

user_gender
F          289871
M          373480
Unknown     80981
Name: user_gender, dtype: int64

### Prepare the data for machine learning by eliminating instances with 'Unknown' gender values, and then separating the features (X) from the target variable (y).

In [None]:
# Exclude instances with "Unknown" gender values
data = Gender[Gender['user_gender'] != 'Unknown']
# Split the dataset into features (X) and target (y)
X = data.drop('user_gender', axis=1)
y = data['user_gender']

### Split the data into training and testing sets, initialize and train the decision tree classifier, and then predict the gender for the test set.

In [None]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the decision tree classifier
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)

# Predict the gender for the test set
y_pred = decision_tree.predict(X_test)

### Evaluate the Model Performance: Accuracy, Recall, Precision, and Confusion Matrix

In [None]:
# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
# Calculate recall, precision, and confusion matrix
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
# Print recall, precision, confusion matrix, and kappa score
print("Recall:", recall)
print("Precision:", precision)
print("Confusion Matrix:\n", conf_matrix)


Accuracy: 0.9635715416330622
Recall: 0.9635715416330622
Precision: 0.9635742902877916
Confusion Matrix:
 [[55576  2398]
 [ 2435 72262]]


### Impute "Unknown" Gender Values with Predicted Gender Using Decision Tree Model

In [None]:

# Filter out instances with "Unknown" gender values
unknown_indices = Gender[Gender['user_gender'] == 'Unknown'].index
unknown_data = Gender.loc[unknown_indices]

# Remove the "Unknown" instances for prediction
X_unknown = unknown_data.drop('user_gender', axis=1)

# Predict the gender for the instances with "Unknown" gender values
predicted_gender = decision_tree.predict(X_unknown) #predic. prob.

# Replace the "Unknown" values with the predicted values
Gender.loc[unknown_indices, 'user_gender'] = predicted_gender

Gender


Unnamed: 0,user_crm_id,user_gender,return_status,item_main_category,item_brand,item_sub_category_code,item_gender_code,has_coupon,opt_in_status,prism_plus_status_code,prism_plus_tier
0,7535440,F,0,4,17,7,2,0,0,0,0
1,7535440,F,0,2,17,8,2,0,0,0,0
2,7537040,F,0,4,86,7,2,0,0,0,0
3,7535945,F,0,4,17,7,2,0,0,0,0
4,7535945,F,0,4,17,7,2,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
744327,5490168,M,1,8,78,23,1,0,0,0,0
744328,4988352,F,1,4,6,7,2,0,0,0,0
744329,4988352,F,1,4,6,7,2,0,0,0,0
744330,5487045,F,0,8,6,23,3,0,0,0,0


In [None]:
Gender.groupby('user_gender')['user_gender'].count()

user_gender
F    329807
M    414525
Name: user_gender, dtype: int64

### Save only the user_crm_id and user_gender columns to a CSV file

In [None]:
data_unknown[['user_crm_id', 'user_gender']].to_csv("Gender_user_crm_id_predicted.csv", index=False)
