# Gender Classification using NLP

In [32]:
import pandas as pd
import numpy as np

In [33]:
male = pd.read_csv('male.txt',sep='\t')
female = pd.read_csv('female.txt',sep='\t')

In [34]:
male.shape,female.shape

((2819, 1), (2629, 1))

In [35]:
male['labels'] = 1
female['labels'] = 0

In [36]:
messages = pd.concat([male,female],ignore_index=True)

In [37]:
messages

Unnamed: 0,Review,labels
0,Busy but a good quality hotel. Would stay again.,1
1,"Clean, Friendly, Modern Hotel As per the title...",1
2,Great time in the Dominican I went with my now...,1
3,Decent enough hotel I have mixed feelings abou...,1
4,Convenient When I say Above Average I'm compar...,1
...,...,...
5443,"Fantastic Location, can't beat it! I recently ...",0
5444,"Fabulous Location, Chic Hotel! Hotel Granados ...",0
5445,Beautiful Resort........food not so good My hu...,0
5446,Great Break We went to The Gallery in Barcelon...,0


## Data cleaning and preprocessing

In [38]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kavya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [39]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kavya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\kavya\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kavya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [40]:
corpus = []           # Corpus is preprocessed Reviews
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['Review'][i])
    review = review.lower()
    review = review.split()
    
    review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [41]:
# Importing GENSIM 
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
from gensim.models import keyedvectors
import gensim.models

In [42]:
words=[]       # List of lists containing words in Reviews 
for sent in corpus:
    sent_token=sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))

In [43]:
words[0],len(words)

(['busy', 'good', 'quality', 'hotel', 'would', 'stay'], 5448)

## WORD2VEC

In [44]:
Word2Vecmodel = gensim.models.Word2Vec(words,vector_size = 300,min_count=5)
# Word2Vecmodel.build_vocab(words)
Word2Vecmodel.train(words,total_examples = Word2Vecmodel.corpus_count,epochs=30)

(13006742, 15166380)

In [45]:
Word2Vecmodel.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)

[('queen', 0.5113424062728882)]

In [47]:
Word2Vecmodel.wv.most_similar('good')

[('decent', 0.6054075956344604),
 ('great', 0.6017729640007019),
 ('excellent', 0.5534951686859131),
 ('reasonable', 0.45005112886428833),
 ('nice', 0.41022199392318726),
 ('ok', 0.3944856524467468),
 ('average', 0.3805086612701416),
 ('exceptional', 0.37877753376960754),
 ('unbeatable', 0.37695351243019104),
 ('plentiful', 0.3644424378871918)]

In [48]:
Word2Vecmodel.wv.most_similar('bad')

[('horrible', 0.4699917137622833),
 ('terrible', 0.4161779582500458),
 ('complain', 0.37273848056793213),
 ('poor', 0.3718932867050171),
 ('worse', 0.353372186422348),
 ('good', 0.34453627467155457),
 ('awful', 0.33780789375305176),
 ('worst', 0.32984551787376404),
 ('complains', 0.3258015215396881),
 ('wrong', 0.3218424916267395)]

In [56]:
len(Word2Vecmodel.wv.vectors)

5993

In [None]:
Word2Vecmodel.wv.vectors[0]