## Transductive Learning: 
Gender Name Classification

**Dataset:**  
UCI Machine Learning Repository  
*Gender by Name Dataset

**Attributes:**
- attributes first names to genders.


In [119]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB

In [120]:
#Load the dataset
data = pd.read_csv('name_gender_dataset.csv')
data = data.drop(['Count', 'Probability'], axis=1)
#Convert the label to numerical data
data['Gender'] = data['Gender'].map({'M': 0, 'F': 1})

data

Unnamed: 0,Name,Gender
0,James,0
1,John,0
2,Robert,0
3,Michael,0
4,William,0
...,...,...
147264,Zylenn,0
147265,Zymeon,0
147266,Zyndel,0
147267,Zyshan,0


In [121]:
# Split the dataset into labeled and unlabeled sets
labeled_data, unlabeled_data = train_test_split(data, test_size=0.5, random_state=42, stratify=data['Gender'])

# Drop the gender labels from the unlabeled data
unlabeled_data_indices = unlabeled_data.index
unlabeled_data = unlabeled_data.drop('Gender', axis=1)

In [122]:
# Vectorize text data
vectorizer = TfidfVectorizer()
X_labeled = vectorizer.fit_transform(labeled_data['Name'])
X_unlabeled = vectorizer.transform(unlabeled_data['Name'])


y_labeled = labeled_data['Gender']

# Combine labeled and unlabeled data
combined_X = vectorizer.transform(pd.concat([labeled_data['Name'], unlabeled_data['Name']]))
combined_y = pd.concat([y_labeled, pd.Series([-1] * len(unlabeled_data))], ignore_index=True)

# Train a SelfTrainingClassifier
self_training_model = SelfTrainingClassifier(MultinomialNB())
self_training_model.fit(combined_X, combined_y)

# Predict on the unlabeled data
unlabeled_predictions = self_training_model.predict(X_unlabeled)

aligned_labels = data.loc[unlabeled_data_indices, 'Gender']

# Evaluate the model
print(f"Transductive Semi-Supervised Accuracy: {accuracy_score(aligned_labels, unlabeled_predictions)}")
print(classification_report(aligned_labels, unlabeled_predictions))

Transductive Semi-Supervised Accuracy: 0.5790724519589869
              precision    recall  f1-score   support

           0       0.23      0.03      0.06     28760
           1       0.60      0.93      0.73     44875

    accuracy                           0.58     73635
   macro avg       0.42      0.48      0.39     73635
weighted avg       0.46      0.58      0.47     73635

