In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
import time
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer



# Loading & Labelling data

In [4]:
female=pd.read_csv('female.txt',sep="\n")
female.columns=['Name']
female['title']='Ms' 
female['Gender']=0
female=female.iloc[:, [1,0,2]]
female

Unnamed: 0,title,Name,Gender
0,Ms,Abagail,0
1,Ms,Abbe,0
2,Ms,Abbey,0
3,Ms,Abbi,0
4,Ms,Abbie,0
...,...,...,...
4995,Ms,Zorine,0
4996,Ms,Zsa Zsa,0
4997,Ms,Zsazsa,0
4998,Ms,Zulema,0


In [5]:
female=female.head(2942) 
female

Unnamed: 0,title,Name,Gender
0,Ms,Abagail,0
1,Ms,Abbe,0
2,Ms,Abbey,0
3,Ms,Abbi,0
4,Ms,Abbie,0
...,...,...,...
2937,Ms,Leilah,0
2938,Ms,Leisha,0
2939,Ms,Lela,0
2940,Ms,Lelah,0


In [6]:
male=pd.read_csv('male.txt',sep="\n")
male.columns=['Name']
male['title']='Mr'
male['Gender']=1
male=male.iloc[:, [1,0,2]] 
male

Unnamed: 0,title,Name,Gender
0,Mr,Aaron,1
1,Mr,Abbey,1
2,Mr,Abbie,1
3,Mr,Abbot,1
4,Mr,Abbott,1
...,...,...,...
2937,Mr,Zeus,1
2938,Mr,Zippy,1
2939,Mr,Zollie,1
2940,Mr,Zolly,1


In [7]:
frames=[male,female]
df=pd.concat(frames)
df=df.sample(frac=1) 
df

Unnamed: 0,title,Name,Gender
610,Ms,Bobette,0
1092,Mr,Hansel,1
1385,Ms,Donica,0
1328,Ms,Devina,0
1186,Ms,Danette,0
...,...,...,...
2480,Ms,Joelly,0
568,Mr,Darby,1
1770,Ms,Feodora,0
1185,Ms,Danelle,0


In [8]:
le = LabelEncoder()
title=le.fit_transform(df['title'])
title.size

5884

In [9]:
X=df[['Name','title']] 
X

Unnamed: 0,Name,title
610,Bobette,Ms
1092,Hansel,Mr
1385,Donica,Ms
1328,Devina,Ms
1186,Danette,Ms
...,...,...
2480,Joelly,Ms
568,Darby,Mr
1770,Feodora,Ms
1185,Danelle,Ms


In [10]:

y=df['Gender']
ohe=OneHotEncoder()
ohe.fit(X)
x=ohe.transform(X)
x

<5884x5679 sparse matrix of type '<class 'numpy.float64'>'
	with 11768 stored elements in Compressed Sparse Row format>

In [11]:

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
X_train

<4118x5679 sparse matrix of type '<class 'numpy.float64'>'
	with 8236 stored elements in Compressed Sparse Row format>

In [12]:
param_grid = {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['linear']}  
  
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3) 
  
# fitting the model for grid search 
#grid.fit(X_train, y_train)
# Perform classification with SVM, kernel=linear
#classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
grid.fit(X_train, y_train)
print(grid.best_params_)
t1 = time.time()
prediction_linear = grid.predict(X_test)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1
# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(y_test, prediction_linear, output_dict=True)
print(report)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END .....C=0.1, gamma=1, kernel=linear;, score=1.000 total time=   0.0s
[CV 2/5] END .....C=0.1, gamma=1, kernel=linear;, score=1.000 total time=   0.0s
[CV 3/5] END .....C=0.1, gamma=1, kernel=linear;, score=1.000 total time=   0.1s
[CV 4/5] END .....C=0.1, gamma=1, kernel=linear;, score=1.000 total time=   0.0s
[CV 5/5] END .....C=0.1, gamma=1, kernel=linear;, score=1.000 total time=   0.0s
[CV 1/5] END ...C=0.1, gamma=0.1, kernel=linear;, score=1.000 total time=   0.0s
[CV 2/5] END ...C=0.1, gamma=0.1, kernel=linear;, score=1.000 total time=   0.0s
[CV 3/5] END ...C=0.1, gamma=0.1, kernel=linear;, score=1.000 total time=   0.1s
[CV 4/5] END ...C=0.1, gamma=0.1, kernel=linear;, score=1.000 total time=   0.0s
[CV 5/5] END ...C=0.1, gamma=0.1, kernel=linear;, score=1.000 total time=   0.0s
[CV 1/5] END ..C=0.1, gamma=0.01, kernel=linear;, score=1.000 total time=   0.0s
[CV 2/5] END ..C=0.1, gamma=0.01, kernel=linear