## Transductive Learning: 
Gender Name Classification

**Dataset:**  
UCI Machine Learning Repository  
*Gender by Name Dataset

**Attributes:**
- attributes first names to genders.


In [247]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB

In [248]:
#Load the dataset
data = pd.read_csv('name_gender_dataset.csv')
data = data.drop(['Count', 'Probability'], axis=1)
#Convert the label to numerical data
data['Gender'] = data['Gender'].map({'M': 0, 'F': 1})

data

Unnamed: 0,Name,Gender
0,James,0
1,John,0
2,Robert,0
3,Michael,0
4,William,0
...,...,...
147264,Zylenn,0
147265,Zymeon,0
147266,Zyndel,0
147267,Zyshan,0


In [249]:
label_counts = data['Gender'].value_counts()
print(label_counts)


Gender
1    89749
0    57520
Name: count, dtype: int64


In [250]:

null_labels = data['Gender'].isnull().sum()
print(f"Number of null labels: {null_labels}")


empty_labels = (data['Gender'] == '').sum()
print(f"Number of empty labels: {empty_labels}")

rows_without_labels = data[data['Gender'].isnull() | (data['Gender'] == '')]
print(f"Rows without labels: {len(rows_without_labels)}")

Number of null labels: 0
Number of empty labels: 0
Rows without labels: 0


In [251]:
# Split the dataset into labeled and unlabeled sets
labeled_data, unlabeled_data = train_test_split(data, test_size=0.5, random_state=42, stratify=data['Gender'])

# Drop the gender labels from the unlabeled data
unlabeled_data_indices = unlabeled_data.index
unlabeled_data = unlabeled_data.drop('Gender', axis=1)

print("Number of labeled instances:", labeled_data.shape[0])
print("Number of unlabeled instances:", unlabeled_data.shape[0])


Number of labeled instances: 73634
Number of unlabeled instances: 73635


In [252]:
unlabeled_data

Unnamed: 0,Name
6184,Penni
57041,Rakshan
113132,Lara-Marina
56032,Tahje
8307,Arlean
...,...
125891,Maria-Josefa
85281,Roshea
2828,Jed
31738,Sefora


In [253]:
# Vectorize text data
vectorizer = TfidfVectorizer()
X_labeled = vectorizer.fit_transform(labeled_data['Name'])
X_unlabeled = vectorizer.transform(unlabeled_data['Name'])

# Prepare labels for the labeled data
y_labeled = labeled_data['Gender']

# Combine labeled and unlabeled data
combined_X = vectorizer.transform(pd.concat([labeled_data['Name'], unlabeled_data['Name']]))
combined_y = pd.concat([y_labeled, pd.Series([-1] * len(unlabeled_data))], ignore_index=True)

# Train a SelfTrainingClassifier
self_training_model = SelfTrainingClassifier(MultinomialNB())
self_training_model.fit(combined_X, combined_y)

# Predict on the unlabeled data
unlabeled_predictions = self_training_model.predict(X_unlabeled)

# Evaluate the model
aligned_labels = data.loc[unlabeled_data_indices, 'Gender']
accuracy = accuracy_score(aligned_labels, unlabeled_predictions)
classification_rep = classification_report(aligned_labels, unlabeled_predictions)

print("\nAccuracy:", accuracy)
print("\nClassification Report:")
print(classification_rep)


Accuracy: 0.5790724519589869

Classification Report:
              precision    recall  f1-score   support

           0       0.23      0.03      0.06     28760
           1       0.60      0.93      0.73     44875

    accuracy                           0.58     73635
   macro avg       0.42      0.48      0.39     73635
weighted avg       0.46      0.58      0.47     73635



In [254]:
unlabeled_results = pd.DataFrame({
    'Name': unlabeled_data['Name'],
    'Predicted_Gender': unlabeled_predictions
})
print("Names and Predicted Genders:")
unlabeled_results 

Names and Predicted Genders:


Unnamed: 0,Name,Predicted_Gender
6184,Penni,1
57041,Rakshan,1
113132,Lara-Marina,1
56032,Tahje,0
8307,Arlean,1
...,...,...
125891,Maria-Josefa,1
85281,Roshea,1
2828,Jed,1
31738,Sefora,1
