# Gender Classification using Word2Vec

In [1]:
import pandas as pd
import numpy as np

In [2]:
male = pd.read_csv('male.txt',sep='\t')
female = pd.read_csv('female.txt',sep='\t')

In [3]:
male.shape,female.shape

((2819, 1), (2629, 1))

In [4]:
male['labels'] = 1
female['labels'] = 0

In [5]:
messages = pd.concat([male,female],ignore_index=True)

In [6]:
messages

Unnamed: 0,Review,labels
0,Busy but a good quality hotel. Would stay again.,1
1,"Clean, Friendly, Modern Hotel As per the title...",1
2,Great time in the Dominican I went with my now...,1
3,Decent enough hotel I have mixed feelings abou...,1
4,Convenient When I say Above Average I'm compar...,1
...,...,...
5443,"Fantastic Location, can't beat it! I recently ...",0
5444,"Fabulous Location, Chic Hotel! Hotel Granados ...",0
5445,Beautiful Resort........food not so good My hu...,0
5446,Great Break We went to The Gallery in Barcelon...,0


## Data cleaning and preprocessing

In [7]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kavya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kavya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\kavya\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kavya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
corpus = []           # Corpus is preprocessed Reviews
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['Review'][i])
    review = review.lower()
    review = review.split()
    
    review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [10]:
# Importing GENSIM 
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
from gensim.models import keyedvectors
import gensim.models

In [11]:
words=[]       # List of lists containing words in Reviews 
for sent in corpus:
    sent_token=sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))

In [12]:
words[0],len(words)

(['busy', 'good', 'quality', 'hotel', 'would', 'stay'], 5448)

In [13]:
messages.insert(1, 'Processed_Review', words)

In [14]:
messages.head()

Unnamed: 0,Review,Processed_Review,labels
0,Busy but a good quality hotel. Would stay again.,"[busy, good, quality, hotel, would, stay]",1
1,"Clean, Friendly, Modern Hotel As per the title...","[clean, friendly, modern, hotel, per, title, h...",1
2,Great time in the Dominican I went with my now...,"[great, time, dominican, went, fiance, propose...",1
3,Decent enough hotel I have mixed feelings abou...,"[decent, enough, hotel, mixed, feeling, stay, ...",1
4,Convenient When I say Above Average I'm compar...,"[convenient, say, average, comparing, sparse, ...",1


In [15]:
from sklearn.model_selection import train_test_split

In [18]:
# Encoding the label column
messages['label'] = messages['labels'].map({1:1,0:0})
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split (messages['Processed_Review'], messages['label'] , test_size=0.2)

In [19]:
X_train[0],y_train[0]

(['busy', 'good', 'quality', 'hotel', 'would', 'stay'], 1)

## WORD2VEC

In [112]:
w2v_model = gensim.models.Word2Vec(X_train,
                                   vector_size=300,
                                   window=5,
                                   min_count=1)

In [113]:
w2v_model.train(X_train,total_examples=w2v_model.corpus_count,epochs=50)

(18182791, 20082550)

_____________________________________________________________________________________

In [114]:
len(w2v_model.wv.index_to_key)

18090

In [115]:
len(w2v_model.wv.vectors)

18090

In [116]:
w2v_model.wv.most_similar('bad')

[('complain', 0.3286914527416229),
 ('terrible', 0.31816184520721436),
 ('dylan', 0.31191787123680115),
 ('wrong', 0.30868852138519287),
 ('poor', 0.30851513147354126),
 ('horrible', 0.3081361949443817),
 ('funny', 0.30722224712371826),
 ('upset', 0.3066122531890869),
 ('lacking', 0.2877514362335205),
 ('sorry', 0.2872674763202667)]

In [117]:
w2v_model.wv.most_similar('good')

[('great', 0.5598031282424927),
 ('excellent', 0.4896434545516968),
 ('decent', 0.439590722322464),
 ('reasonable', 0.34724682569503784),
 ('superb', 0.34139809012413025),
 ('amazing', 0.3379555940628052),
 ('poor', 0.33348435163497925),
 ('nice', 0.3302851617336273),
 ('best', 0.319502055644989),
 ('variable', 0.3161775469779968)]

In [118]:
w2v_model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)

[('queen', 0.43692314624786377)]

______________________________________________________________________

## Finding vector forms for sentences

In [119]:
word = set(w2v_model.wv.index_to_key )
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in word]) for ls in X_train],dtype=object)
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in word]) for ls in X_test],dtype=object)

In [120]:
# Compute sentence vectors by averaging the word vectors for the words contained in the sentence
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(300, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(300, dtype=float))

## Random Forest Classifier

In [121]:
# Instantiate and fit a basic Random Forest model on top of the vectors
from sklearn.ensemble import RandomForestClassifier
Random_forest = RandomForestClassifier()
Random_model = Random_forest.fit(X_train_vect_avg, y_train.values.ravel())

In [122]:
# Use the trained model to make predictions on the test data
y_pred = Random_model.predict(X_test_vect_avg)

In [123]:
from sklearn.metrics import precision_score, recall_score,accuracy_score
precision = precision_score(y_test, y_pred,average= 'micro')
recall = recall_score(y_test, y_pred,average= 'micro')
accuracy = accuracy_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3), round(recall, 3), accuracy))


Precision: 0.626 / Recall: 0.626 / Accuracy: 0.6256880733944954


## SVM

In [124]:
from sklearn.svm import SVC # "Support vector classifier"  
svm = SVC()  
sv_model = svm.fit(X_train_vect_avg, y_train.values.ravel())  
y_pred = sv_model.predict(X_test_vect_avg)

In [125]:
y_pred

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [126]:
precision = precision_score(y_test, y_pred,average= 'micro')
recall = recall_score(y_test, y_pred,average= 'micro')
accuracy = accuracy_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3), round(recall, 3), accuracy))

Precision: 0.656 / Recall: 0.656 / Accuracy: 0.6559633027522935


________________________________________________________________________________________________________

In [57]:
CSResults = pd.DataFrame(columns=['Model Name','True Positive',	'False Negative',	'False Positive',	'True Negative',	'Accuracy',	'Precision',	'Recall',	'F1 Score',	'Specificity',	'MCC',	'ROC_AUC_Score',	'Balanced Accuracy'])
CSResults.head()

Unnamed: 0,Model Name,True Positive,False Negative,False Positive,True Negative,Accuracy,Precision,Recall,F1 Score,Specificity,MCC,ROC_AUC_Score,Balanced Accuracy


In [58]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import sklearn.metrics as metrics
from sklearn.metrics import roc_curve, roc_auc_score


# Build the model
modelXGB = XGBClassifier(n_estimators=100, max_depth=3, eval_metric='mlogloss')

bankdataSVM = SVC(C=1.0, kernel='linear', degree=3, gamma='scale', coef0=0.0, shrinking=True, 
                  probability=True, tol=0.001, cache_size=200, class_weight=None, verbose=False, 
                  max_iter=- 1, decision_function_shape='ovr', break_ties=False, random_state=None)
ModelKNN = KNeighborsClassifier(n_neighbors=5)
modelGNB = GaussianNB()
ModelLR = LogisticRegression()
ModelDC = DecisionTreeClassifier()
ModelRF = RandomForestClassifier()
ModelET = ExtraTreesClassifier()

# Evalution matrix for all the algorithm

MM = [ModelLR, ModelDC, ModelRF, ModelET, modelGNB, ModelKNN, bankdataSVM]
for models in MM:
            
    # Train the model training dataset
    
    models.fit(X_train_vect_avg, y_train.values)
    
    # Prediction the model with test dataset
    
    y_pred = models.predict(X_test_vect_avg)
    y_pred_prob = models.predict_proba(X_test_vect_avg)

    
    # confusion matrix in sklearn

    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import classification_report

    # actual values

    actual = y_test

    # predicted values

    predicted = y_pred

    tp, fn, fp, tn = confusion_matrix(actual,predicted,labels=[1,0]).reshape(-1)
    # calculating the metrics

    sensitivity = round(tp/(tp+fn), 3);
    specificity = round(tn/(tn+fp), 3);
    accuracy = round((tp+tn)/(tp+fp+tn+fn), 3);
    balanced_accuracy = round((sensitivity+specificity)/2, 3);
    
    precision = round(tp/(tp+fp), 3);
    f1Score = round((2*tp/(2*tp + fp + fn)), 3);

    # Matthews Correlation Coefficient (MCC). Range of values of MCC lie between -1 to +1. 
    # A model with a score of +1 is a perfect model and -1 is a poor model

    from math import sqrt

    mx = (tp+fp) * (tp+fn) * (tn+fp) * (tn+fn)
    MCC = round(((tp * tn) - (fp * fn)) / sqrt(mx), 3)

    from sklearn.metrics import roc_curve, roc_auc_score
    #------------------------------------------------------
    new_row = {'Model Name' : models,
               'True Positive': tp,
               'False Negative': fn, 
               'False Positive': fp, 
               'True Negative': tn,
               'Accuracy' : accuracy,
               'Precision' : precision,
               'Recall' : sensitivity,
               'F1 Score' : f1Score,
               'Specificity' : specificity,
               'MCC': MCC,
               'ROC_AUC_Score':roc_auc_score(actual, y_pred),
               'Balanced Accuracy':balanced_accuracy}
    CSResults = CSResults.append(new_row,ignore_index=True)
    #------------------------------------------------------

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  CSResults = CSResults.append(new_row,ignore_index=True)
  CSResults = CSResults.append(new_row,ignore_index=True)
  CSResults = CSResults.append(new_row,ignore_index=True)
  CSResults = CSResults.append(new_row,ignore_index=True)
  CSResults = CSResults.append(new_row,ignore_index=True)
  CSResults = CSResults.append(new_row,ignore_index=True)
  CSResults = CSResults.append(new_row,ignore_index=True)


In [59]:
CSResults

Unnamed: 0,Model Name,True Positive,False Negative,False Positive,True Negative,Accuracy,Precision,Recall,F1 Score,Specificity,MCC,ROC_AUC_Score,Balanced Accuracy
0,LogisticRegression(),393,191,190,316,0.65,0.674,0.673,0.674,0.625,0.297,0.648726,0.649
1,DecisionTreeClassifier(),333,251,233,273,0.556,0.588,0.57,0.579,0.54,0.11,0.554866,0.555
2,"(DecisionTreeClassifier(max_features='auto', r...",364,220,194,312,0.62,0.652,0.623,0.637,0.617,0.239,0.619944,0.62
3,"(ExtraTreeClassifier(random_state=755008496), ...",366,218,232,274,0.587,0.612,0.627,0.619,0.542,0.169,0.584107,0.584
4,GaussianNB(),445,139,324,182,0.575,0.579,0.762,0.658,0.36,0.133,0.560835,0.561
5,KNeighborsClassifier(),360,224,258,248,0.558,0.583,0.616,0.599,0.49,0.107,0.553278,0.553
6,"SVC(kernel='linear', probability=True)",397,187,181,325,0.662,0.687,0.68,0.683,0.642,0.322,0.661044,0.661


In [60]:
CSResults.to_csv('CsResults.csv')

______________________________________________________________________

## SAMPLE TESTING

In [61]:
def preprocess(sample):
  sample_corpus = []
  for i in range(0, 1):
    # review = review.str.replace('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'," ")
    review = re.sub('[^a-zA-Z]', ' ', sample)
    review = review.lower()
    review = review.split()
    
    review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    sample_corpus.append(review)
    
  sample_words=[]
  for sent in sample_corpus:
    sent_token = sent_tokenize(sent)
    for sent in sent_token:
        sample_words.append(simple_preprocess(sent))
  # return sample_words
  return w2v_model.wv[sample_words[0]]

In [67]:
def avg_word2vec(doc):
  return np.mean(list(doc),axis=0)

In [68]:
def RPredict(s):
    if Random_model.predict(s) == [0]:
        return "FEMALE"
    else:
        return "MALE"
def SPredict(s):
    if sv_model.predict(s) == [0]:
        return "FEMALE"
    else:
        return "MALE"

In [78]:
s = avg_word2vec(preprocess('Busy but a good quality hotel. Would stay again.'))
s = s.reshape(1,-1)

print("Random Forest Prediction : ",RPredict(s))
print("SVM Prediction : ", SPredict(s))
print("LogisticRegression : ",ModelLR.predict(s))
print("DecisionTreeClassifier : ",ModelDC.predict(s))
print("ExtraTreesClassifier : ",ModelET.predict(s))
print("GaussianNB : ",modelGNB.predict(s))
print("KNeighborsClassifier : ",ModelKNN.predict(s))

Random Forest Prediction :  MALE
SVM Prediction :  MALE
LogisticRegression :  [1]
DecisionTreeClassifier :  [1]
ExtraTreesClassifier :  [1]
GaussianNB :  [1]
KNeighborsClassifier :  [1]


In [79]:
s = avg_word2vec(preprocess('''Without doubt one of the favorite place I stayed during my "Solo Travel time in".Eco Resort has great rooms with excellent services and location was also a bonus. Proprietor himself takes care of the guests. Very friendly and competent staff, extremely helpful yet never intrusive.
I stayed here for 5 days. My room was big, clean and very comfortable, the view from my room was of snow covered mountains. It felt like, I was the only person staying there, very relaxing, very calm, very serene.
I would highly recommend to everybody, especially for any FEMALE SOLO TRAVELER! It's a true sample of excellent hospitality.
Thank you again Eco Resort for going just that little step extra for the guests!!!'''))
s = s.reshape(1,-1)

print("Random Forest Prediction : ",RPredict(s))
print("SVM Prediction : ", SPredict(s))
print("LogisticRegression : ",ModelLR.predict(s))
print("DecisionTreeClassifier : ",ModelDC.predict(s))
print("ExtraTreesClassifier : ",ModelET.predict(s))
print("GaussianNB : ",modelGNB.predict(s))
print("KNeighborsClassifier : ",ModelKNN.predict(s))

Random Forest Prediction :  FEMALE
SVM Prediction :  FEMALE
LogisticRegression :  [0]
DecisionTreeClassifier :  [1]
ExtraTreesClassifier :  [0]
GaussianNB :  [0]
KNeighborsClassifier :  [1]
