## Installation Of Gensim For Word2Vec

In [29]:
conda install -c gensim


Note: you may need to restart the kernel to use updated packages.


In [30]:
from gensim.models import Word2Vec,KeyedVectors

## Import Sklearn

In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

## Importing Dataset

In [32]:
df= pd.read_csv('data_scraped6.csv')

## Trying For Combination

In [33]:
Combo = df["Title"]+df["Url_address"]
df = df.assign(Combo = Combo) 
df['Combo']=df['Combo'].fillna("")

In [34]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

## Taking the values from the dataset

In [35]:
Body=df['Combo'].values.tolist()

## Cleaning of the words by tokenising them and removing punctuations and stop words

In [36]:
newVec=list()
for line in Body:
    #create words token as well as remove punctuation in one go
    rem_tok_punc=RegexpTokenizer(r'\w+')
    tokens=rem_tok_punc.tokenize(line)
    
    #convert words to lower
    words=[w.lower() for w in tokens]
    #Invoke all english stopwords
    stop_word_list=list(stopwords.words('english'))
    stop_word_list.append('https')
    stop_word_list.append('www')
    stop_word_list.append('.com')
    #Remove Stop words
    words=[w for w in words if not w in stop_word_list]
    #Append words in the newVec
    newVec.append(words)
len(newVec)

4985

## Fitting Word2Vector Model 

In [37]:
model=Word2Vec(newVec,min_count=4,size=100,window=10,sg=1,iter=10)

## Number Of Words in Vocabulary

In [38]:
# This will give the total number of words in the vocabolary created from this dataset
model.wv.syn0.shape

  


(4242, 100)

## Finding Vector Of Each Word

In [39]:
# Function to average all word vectors in a paragraph
def featureVecMethod(words, model, num_features):
    # Pre-initialising empty numpy array for speed
    featureVec = np.zeros(num_features,dtype="float32")
    nwords = 0
    
    #Converting Index2Word which is a list to a set for better speed in the execution.
    index2word_set = set(model.wv.index2word)
    ##print(index2word_set)
    for word in  words:
        if word in index2word_set:
            nwords = nwords + 1
            featureVec = np.add(featureVec,model[word])
    
    # Dividing the result by number of words to get average
    featureVec = np.divide(featureVec, nwords)
    return featureVec

In [40]:
# Function for calculating the average feature vector
def getAvgFeatureVecs(reviews, model, num_features):
    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    for review in reviews:
        # Printing a status message every 1000th review
        if counter%1000 == 0:
            print("Review %d of %d"%(counter,len(reviews)))
            
        reviewFeatureVecs[counter] = featureVecMethod(review, model, num_features)
        counter = counter+1
        
    return reviewFeatureVecs

In [41]:
# Calculating average feature vector for training set
clean_train_reviews=df['Combo'].values
num_features=100
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)

Review 0 of 4985


  del sys.path[0]


Review 1000 of 4985
Review 2000 of 4985
Review 3000 of 4985
Review 4000 of 4985


In [42]:
trainDataVecs.shape

(4985, 100)

In [43]:
df.shape

(4985, 14)

In [44]:
data=trainDataVecs
traindata=pd.DataFrame(data=data[0:,0:])  # 1st row as the column names
traindata.shape

(4985, 100)

In [45]:
traindata=traindata.fillna(0)
traindata.describe

<bound method NDFrame.describe of             0         1         2         3         4         5         6   \
0     0.089642 -0.069349 -0.000922 -0.123347 -0.262326 -0.039725 -0.191089   
1     0.059134 -0.051073 -0.090622 -0.131229 -0.231351 -0.033073 -0.153262   
2     0.079827 -0.062014 -0.010866 -0.119944 -0.267130 -0.024986 -0.193352   
3     0.118036 -0.042561 -0.038508 -0.155216 -0.226599 -0.157615 -0.175355   
4     0.149052 -0.067494 -0.011343 -0.166730 -0.244366 -0.123486 -0.241577   
...        ...       ...       ...       ...       ...       ...       ...   
4980  0.136590 -0.076115  0.027155 -0.112761 -0.345436 -0.033608 -0.222134   
4981  0.113877 -0.065642 -0.067232 -0.154733 -0.263175 -0.061450 -0.191191   
4982  0.123378 -0.072741 -0.053482 -0.137665 -0.277063 -0.012661 -0.192086   
4983  0.130995 -0.093749  0.028131 -0.128906 -0.300404  0.039696 -0.236086   
4984  0.137196 -0.061256 -0.027214 -0.117612 -0.263530 -0.076538 -0.195195   

            7         8      

## Various Algorithms(LogisticRegression, KNN, CART,RandomForest,NaiveBayes) 

In [46]:
# prepare configuration for cross validation test harness
seed = 7
# prepare models
models = []
models.append(('LR',LogisticRegression(max_iter=500,solver='lbfgs',multi_class='multinomial')))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('Rand_Forest', RandomForestClassifier()))
models.append(('NB', BernoulliNB()))
models.append (('SDG', SGDClassifier()))

# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
for name, model in models:
	kfold = model_selection.KFold(n_splits=10, random_state=seed,shuffle=True)
	cv_results = model_selection.cross_val_score(model, traindata,df["Flair"], cv=kfold, scoring=scoring)
	results.append(cv_results)
	names.append(name)
	msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
	print(msg)

LR: 0.251959 (0.023303)
KNN: 0.277837 (0.012259)
CART: 0.882247 (0.019642)




Rand_Forest: 0.879838 (0.018661)
NB: 0.186159 (0.011322)
SDG: 0.186167 (0.040247)
