## Installation Of Gensim For Word2Vec

In [80]:
conda install -c gensim


Note: you may need to restart the kernel to use updated packages.


In [81]:
from gensim.models import Word2Vec,KeyedVectors

## Import Sklearn

In [82]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

## Importing Dataset

In [83]:
df= pd.read_csv('data_scraped6.csv')

## Trying For Combination

In [84]:
Combo = df["Body"]
df = df.assign(Combo = Combo) 
df['Combo']=df['Combo'].fillna("")

In [85]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

## Taking the values from the dataset

In [86]:
Body=df['Combo'].values.tolist()

## Cleaning of the words by tokenising them and removing punctuations and stop words

In [87]:
newVec=list()
for line in Body:
    #create words token as well as remove punctuation in one go
    rem_tok_punc=RegexpTokenizer(r'\w+')
    tokens=rem_tok_punc.tokenize(line)
    
    #convert words to lower
    words=[w.lower() for w in tokens]
    #Invoke all english stopwords
    stop_word_list=list(stopwords.words('english'))
    stop_word_list.append('https')
    stop_word_list.append('www')
    stop_word_list.append('.com')
    #Remove Stop words
    words=[w for w in words if not w in stop_word_list]
    #Append words in the newVec
    newVec.append(words)
len(newVec)

4985

## Fitting Word2Vector Model 

In [88]:
model=Word2Vec(newVec,min_count=4,size=100,window=10,sg=1,iter=10)

## Number Of Words in Vocabulary

In [89]:
# This will give the total number of words in the vocabolary created from this dataset
model.wv.syn0.shape

  


(15541, 100)

## Finding Vector Of Each Word

In [90]:
# Function to average all word vectors in a paragraph
def featureVecMethod(words, model, num_features):
    # Pre-initialising empty numpy array for speed
    featureVec = np.zeros(num_features,dtype="float32")
    nwords = 0
    
    #Converting Index2Word which is a list to a set for better speed in the execution.
    index2word_set = set(model.wv.index2word)
    ##print(index2word_set)
    for word in  words:
        if word in index2word_set:
            nwords = nwords + 1
            featureVec = np.add(featureVec,model[word])
    
    # Dividing the result by number of words to get average
    featureVec = np.divide(featureVec, nwords)
    return featureVec

In [91]:
# Function for calculating the average feature vector
def getAvgFeatureVecs(reviews, model, num_features):
    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    for review in reviews:
        # Printing a status message every 1000th review
        if counter%1000 == 0:
            print("Review %d of %d"%(counter,len(reviews)))
            
        reviewFeatureVecs[counter] = featureVecMethod(review, model, num_features)
        counter = counter+1
        
    return reviewFeatureVecs

In [92]:
# Calculating average feature vector for training set
clean_train_reviews=df['Combo'].values
num_features=100
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)

Review 0 of 4985


  del sys.path[0]


Review 1000 of 4985
Review 2000 of 4985
Review 3000 of 4985
Review 4000 of 4985


  app.launch_new_instance()


In [93]:
trainDataVecs.shape

(4985, 100)

In [94]:
df.shape

(4985, 14)

In [95]:
data=trainDataVecs
traindata=pd.DataFrame(data=data[0:,0:])  # 1st row as the column names
traindata.shape

(4985, 100)

In [96]:
traindata=traindata.fillna(0)
traindata.describe

<bound method NDFrame.describe of             0         1         2         3         4         5         6   \
0    -0.334557  0.014015 -0.488889 -0.081519  0.302738  0.208021  0.198558   
1    -0.342329  0.002764 -0.478126 -0.289432  0.393219  0.181736  0.146276   
2    -0.411702  0.007607 -0.460745 -0.142816  0.278764  0.173975  0.211231   
3    -0.569433  0.014554 -0.554002 -0.299330  0.127804  0.011653  0.234700   
4    -0.370332  0.033341 -0.424054 -0.143748  0.276850  0.217330  0.232005   
...        ...       ...       ...       ...       ...       ...       ...   
4980 -0.345910  0.043230 -0.442617 -0.126216  0.241358  0.152680  0.205591   
4981 -0.357111  0.066341 -0.429225 -0.119167  0.253656  0.156615  0.214485   
4982 -0.386192  0.043247 -0.403721 -0.207423  0.230342  0.157722  0.251633   
4983  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
4984  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   

            7         8      

## Various Algorithms(LogisticRegression, KNN, CART,RandomForest,NaiveBayes) 

In [97]:
# prepare configuration for cross validation test harness
seed = 7
# prepare models
models = []
models.append(('LR',LogisticRegression(max_iter=500,solver='lbfgs',multi_class='multinomial')))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('Rand_Forest', RandomForestClassifier()))
models.append(('NB', BernoulliNB()))
models.append (('SDG', SGDClassifier()))

# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
for name, model in models:
	kfold = model_selection.KFold(n_splits=10, random_state=seed,shuffle=True)
	cv_results = model_selection.cross_val_score(model, traindata,df["Flair"], cv=kfold, scoring=scoring)
	results.append(cv_results)
	names.append(name)
	msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
	print(msg)

LR: 0.223273 (0.018373)
KNN: 0.227683 (0.015424)
CART: 0.538026 (0.026081)




Rand_Forest: 0.537226 (0.027622)
NB: 0.162090 (0.017297)
SDG: 0.154667 (0.024613)
