### Import Dependencies     

In [3]:
import pandas as pd
import numpy as np
import json
import io
import datetime as dt
import string
import re
import spacy
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
nlp = spacy.load('en_core_web_sm')

### Import Dataset

In [4]:
# Load the raw tweets CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# will only run where this exact file exists.
tweets_csv_path = r'C:\Users\Ikshita\Downloads\Projs\TextPreprocessing\Tweets.csv'
data = pd.read_csv(tweets_csv_path)

### Dictionary of tweets

In [5]:
# Keep three columns (selected by position) and discard every row whose
# 'text' value is missing.
df = data.iloc[:, [1, 2, 10]].dropna(axis=0, subset=['text'])

# Map each remaining tweet's position (0, 1, 2, ...) to its lower-cased,
# whitespace-trimmed text.
tweetdict = {pos: tweet.lower().strip() for pos, tweet in enumerate(df.text)}
print(tweetdict[13])

@virginamerica @virginmedia i'm flying your #fabulous #seductive skies again! u take all the #stress away from travel http://t.co/ahlxhhkiyn


### Data Preprocessing

#### Remove @ Mentions

In [6]:
# Drop every space-separated token that contains "@" (the whole token is
# removed, not just the "@" character) and re-join the rest with single spaces.
tweetdict = {
    key: ' '.join(token for token in text.split(" ") if "@" not in token)
    for key, text in tweetdict.items()
}
print(tweetdict[13])

i'm flying your #fabulous #seductive skies again! u take all the #stress away from travel http://t.co/ahlxhhkiyn


#### Remove # Hashtags

In [7]:
# Drop every space-separated token that contains "#". The hashtag word itself
# is discarded along with the "#" sign.
for key, text in tweetdict.items():
    kept_tokens = [token for token in text.split(" ") if "#" not in token]
    tweetdict[key] = ' '.join(kept_tokens)
print(tweetdict[13])

i'm flying your skies again! u take all the away from travel http://t.co/ahlxhhkiyn


#### Remove Hyperlinks and Emojis

In [8]:
# Compile the patterns once instead of re-parsing them for every tweet.
# The URL pattern is kept exactly as it was. NOTE(review): inside its character
# class, `\+-=` is a range from '+' to '=' rather than three literal characters.
url_pattern = re.compile(
    r'((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)',
    flags=re.MULTILINE,
)
# Raw strings avoid the invalid-escape-sequence warning that '\W' and '\d'
# trigger in ordinary string literals.
non_word_pattern = re.compile(r'\W+')   # runs of anything that is not a letter, digit or underscore
digit_pattern = re.compile(r'\d+')      # runs of digits

for i in range(0, len(tweetdict)):
    text = url_pattern.sub('', tweetdict[i])      # strip hyperlinks
    text = non_word_pattern.sub(' ', text)        # punctuation, emojis and other symbols become one space
    tweetdict[i] = digit_pattern.sub(' ', text)   # numbers become one space
print(tweetdict[13])

i m flying your skies again u take all the away from travel 


#### Remove 'RT' Designation

In [9]:
# Remove the standalone retweet marker. The tweets were lower-cased when
# tweetdict was built, so a case-sensitive replace of 'RT' never matches.
# Match "rt" case-insensitively and only as a whole word, so letters inside
# other words (e.g. "start") are left alone.
rt_pattern = re.compile(r'\brt\b', flags=re.IGNORECASE)
for i in range(0, len(tweetdict)):
    tweetdict[i] = rt_pattern.sub('', tweetdict[i]).strip()

#### Tokenization

In [10]:
# Split each tweet string into a list of whitespace-separated tokens,
# reusing a single tokenizer instance for all tweets.
whitespace_tokenizer = WhitespaceTokenizer()
tweetdict = {
    key: whitespace_tokenizer.tokenize(text)
    for key, text in tweetdict.items()
}
print(tweetdict[13])

['i', 'm', 'flying', 'your', 'skies', 'again', 'u', 'take', 'all', 'the', 'away', 'from', 'travel']


#### Lemmatization

In [11]:
# Replace every token with its WordNet lemma (lemmatize() is called with its
# default arguments), keeping one token list per tweet.
lemmatizer = WordNetLemmatizer()
for key, tokens in tweetdict.items():
    tweetdict[key] = [lemmatizer.lemmatize(token) for token in tokens]
print(tweetdict[13])

['i', 'm', 'flying', 'your', 'sky', 'again', 'u', 'take', 'all', 'the', 'away', 'from', 'travel']


#### Remove Stopwords

In [12]:
from nltk.corpus import stopwords

# English stopword list from NLTK. `en_stopwords` stays a list as before; the
# frozenset copy gives constant-time membership tests instead of scanning the
# whole list for every token.
en_stopwords = list(stopwords.words('english'))
en_stopword_set = frozenset(en_stopwords)

# Drop stopwords from each token list and join the remaining tokens back into a
# single space-separated string per tweet.
for i in range(0, len(tweetdict)):
    tweetdict[i] = ' '.join(
        str(word) for word in tweetdict[i] if word.lower() not in en_stopword_set
    )
print(tweetdict[13])

flying sky u take away travel


In [13]:
# Flatten the position-keyed dictionary into a list ordered by key 0..n-1.
tweets = [tweetdict[pos] for pos in range(len(tweetdict))]

#### Term Frequency - Inverse Document Frequency

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over the cleaned tweets: terms must occur in at least 15 documents and
# in at most 80% of them; term frequency is scaled sub-linearly.
vect = TfidfVectorizer(strip_accents=None,
                       max_df=0.8,
                       min_df=15,
                       lowercase=True,
                       stop_words="english",
                       sublinear_tf=True,
                       analyzer='word')
dtm = vect.fit_transform(tweets)

# get_feature_names() is gone from recent scikit-learn releases in favour of
# get_feature_names_out(); fall back to the old name so older installs still work.
if hasattr(vect, "get_feature_names_out"):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()

# Dense document-term matrix: one row per tweet, one column per vocabulary term.
df_dtm = pd.DataFrame(dtm.toarray(), columns=feature_names)
df_dtm.head(5)

Unnamed: 0,aa,able,absolute,absolutely,accept,acceptable,access,accommodate,account,act,...,wtf,yall,yeah,year,yep,yes,yesterday,yo,yr,zero
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
%%javascript
// Replace the notebook OutputArea's _should_scroll hook with a function that
// always returns false, whatever `lines` is.
// NOTE(review): presumably this stops long cell outputs from being collapsed
// into a scroll box; confirm against the notebook front end in use.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [16]:
# Select the label columns (airline_sentiment, retweet_count) by position.
# The document-term matrix was built only from rows with non-missing 'text',
# so apply the same filter here. Otherwise the positional concat below would
# pair labels with the wrong tweets whenever a row had been dropped.
df_labels = data.dropna(axis=0, subset=['text']).iloc[:, [1, 9]]

# Place labels and TF-IDF features side by side, aligned by row position.
datafr = pd.concat(
    [df_labels.reset_index(drop=True), df_dtm.reset_index(drop=True)],
    axis=1,
)
datafr.head(5)

Unnamed: 0,airline_sentiment,retweet_count,aa,able,absolute,absolutely,accept,acceptable,access,accommodate,...,wtf,yall,yeah,year,yep,yes,yesterday,yo,yr,zero
0,neutral,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,positive,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,neutral,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,negative,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,negative,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score,precision_score ,confusion_matrix,recall_score


## Decision Tree Classifier

In [18]:
from sklearn.tree import DecisionTreeClassifier

# 5-fold cross-validation of an entropy-based decision tree.
x = datafr.values[:,1:]                        # every column after the first: retweet_count + document-term matrix
y = datafr.values[:,0]                         # first column: airline_sentiment label
# NOTE(review): KFold does not shuffle by default, so folds follow row order;
# consider shuffle=True or StratifiedKFold if the rows are grouped - confirm.
kf = KFold(n_splits=5)
accuracy=[]                                    # per-fold accuracy in percent
yt = []                                        # true labels pooled over all folds
yp = []                                        # predicted labels pooled over all folds
for train_index, test_index in kf.split(x):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Fit the scaler on the training fold only, then apply it to the test fold.
    sc_x=StandardScaler()
    x_train=sc_x.fit_transform(x_train)
    x_test=sc_x.transform(x_test)
    # A fixed random_state makes the fitted tree reproducible between runs.
    classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
    classifier.fit(x_train,y_train)
    y_pred = classifier.predict(x_test)
    cm=confusion_matrix(y_test,y_pred)
    print('===============================================================================')
    print('Prediction: ', y_pred)
    print('Correct:    ', y_test)
    print("Confusion Matrix:")
    print(cm)
    acc= accuracy_score(y_test,y_pred)*100
    print("Accuracy is: %0.3f" % acc)          # accuracy for this fold
    accuracy.append(acc)
    pr = precision_score(y_test, y_pred,average=None)
    print("Precision :", pr)                   # per-class precision for this fold
    recscr = recall_score(y_test, y_pred,average=None)
    print("Recall :",  recscr)                 # per-class recall for this fold
    f1 = f1_score(y_test, y_pred,average=None)
    print("F1-Score : ",  f1)                  # per-class F1 for this fold
    yt += list(y_test)
    yp += list(y_pred)

avg_accu = np.mean(accuracy)
print("Average Accuracy: %0.3f" % avg_accu)    # mean of the 5 per-fold accuracies
print(classification_report(yt, yp))           # precision / recall / F1 on the predictions pooled over all folds




Prediction:  ['negative' 'neutral' 'neutral' ..., 'neutral' 'negative' 'negative']
Correct:     ['neutral' 'positive' 'neutral' ..., 'negative' 'negative' 'negative']
Confusion Matrix:
[[1515  258  126]
 [ 252  251   82]
 [ 140   72  232]]
Accuracy is: 68.238
Precision : [ 0.79444153  0.43201377  0.52727273]
Recall : [ 0.79778831  0.42905983  0.52252252]
F1-Score :  [ 0.7961114   0.43053173  0.52488688]
Prediction:  ['negative' 'negative' 'negative' ..., 'negative' 'neutral' 'neutral']
Correct:     ['negative' 'negative' 'negative' ..., 'negative' 'negative' 'neutral']
Confusion Matrix:
[[1336  236  128]
 [ 324  283   79]
 [ 162  103  277]]
Accuracy is: 64.754
Precision : [ 0.73326015  0.45498392  0.57231405]
Recall : [ 0.78588235  0.41253644  0.51107011]
F1-Score :  [ 0.75865985  0.43272171  0.53996101]
Prediction:  ['positive' 'negative' 'positive' ..., 'negative' 'negative' 'positive']
Correct:     ['neutral' 'neutral' 'positive' ..., 'negative' 'negative' 'positive']
Confusion Matr

### Random Forest Classifier

In [19]:
from sklearn.ensemble import RandomForestClassifier

# 5-fold cross-validation of a 10-tree, entropy-based random forest.
x = datafr.values[:,1:]                        # every column after the first: retweet_count + document-term matrix
y = datafr.values[:,0]                         # first column: airline_sentiment label
# NOTE(review): KFold does not shuffle by default, so folds follow row order;
# consider shuffle=True or StratifiedKFold if the rows are grouped - confirm.
kf = KFold(n_splits=5)
accuracy=[]                                    # per-fold accuracy in percent
yt = []                                        # true labels pooled over all folds
yp = []                                        # predicted labels pooled over all folds
for train_index, test_index in kf.split(x):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Fit the scaler on the training fold only, then apply it to the test fold.
    sc_x=StandardScaler()
    x_train=sc_x.fit_transform(x_train)
    x_test=sc_x.transform(x_test)
    # A fixed random_state makes the fitted forest reproducible between runs.
    classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state=0)
    classifier.fit(x_train,y_train)
    y_pred = classifier.predict(x_test)
    cm=confusion_matrix(y_test,y_pred)
    print('===============================================================================')
    print('Prediction: ', y_pred)
    print('Correct:    ', y_test)
    print("Confusion Matrix:")
    print(cm)
    acc= accuracy_score(y_test,y_pred)*100
    print("Accuracy is: %0.3f" % acc)          # accuracy for this fold
    accuracy.append(acc)
    pr = precision_score(y_test, y_pred,average=None)
    print("Precision :", pr)                   # per-class precision for this fold
    recscr = recall_score(y_test, y_pred,average=None)
    print("Recall :",  recscr)                 # per-class recall for this fold
    f1 = f1_score(y_test, y_pred,average=None)
    print("F1-Score : ",  f1)                  # per-class F1 for this fold
    yt += list(y_test)
    yp += list(y_pred)

avg_accu = np.mean(accuracy)
print("Average Accuracy: %0.3f" % avg_accu)    # mean of the 5 per-fold accuracies
print(classification_report(yt, yp))           # precision / recall / F1 on the predictions pooled over all folds




Prediction:  ['negative' 'negative' 'negative' ..., 'negative' 'negative' 'negative']
Correct:     ['neutral' 'positive' 'neutral' ..., 'negative' 'negative' 'negative']
Confusion Matrix:
[[1692  147   60]
 [ 279  257   49]
 [ 158   51  235]]
Accuracy is: 74.590
Precision : [ 0.79473931  0.56483516  0.68313953]
Recall : [ 0.89099526  0.43931624  0.52927928]
F1-Score :  [ 0.84011917  0.49423077  0.5964467 ]




Prediction:  ['negative' 'negative' 'negative' ..., 'negative' 'neutral' 'neutral']
Correct:     ['negative' 'negative' 'negative' ..., 'negative' 'negative' 'neutral']
Confusion Matrix:
[[1534  103   63]
 [ 348  274   64]
 [ 202   68  272]]
Accuracy is: 71.038
Precision : [ 0.73608445  0.61573034  0.68170426]
Recall : [ 0.90235294  0.39941691  0.50184502]
F1-Score :  [ 0.81078224  0.48452697  0.5781084 ]




Prediction:  ['positive' 'negative' 'positive' ..., 'negative' 'negative' 'positive']
Correct:     ['neutral' 'neutral' 'positive' ..., 'negative' 'negative' 'positive']
Confusion Matrix:
[[1124  123   42]
 [ 413  408   96]
 [ 263   94  365]]
Accuracy is: 64.788
Precision : [ 0.62444444  0.6528      0.72564612]
Recall : [ 0.87199379  0.44492912  0.50554017]
F1-Score :  [ 0.72774361  0.52918288  0.59591837]




Prediction:  ['negative' 'negative' 'negative' ..., 'negative' 'negative' 'positive']
Correct:     ['negative' 'neutral' 'negative' ..., 'negative' 'negative' 'positive']
Confusion Matrix:
[[1975  151   77]
 [ 225  154   50]
 [  95   23  178]]
Accuracy is: 78.791
Precision : [ 0.86056645  0.4695122   0.58360656]
Recall : [ 0.89650477  0.35897436  0.60135135]
F1-Score :  [ 0.87816807  0.40686922  0.59234609]




Prediction:  ['negative' 'negative' 'positive' ..., 'negative' 'negative' 'negative']
Correct:     ['negative' 'negative' 'positive' ..., 'neutral' 'negative' 'neutral']
Confusion Matrix:
[[1835  186   66]
 [ 241  186   55]
 [ 110   40  209]]
Accuracy is: 76.161
Precision : [ 0.83943275  0.45145631  0.63333333]
Recall : [ 0.87925252  0.38589212  0.5821727 ]
F1-Score :  [ 0.85888135  0.41610738  0.60667634]
Average Accuracy: 73.074
             precision    recall  f1-score   support

   negative       0.78      0.89      0.83      9178
    neutral       0.56      0.41      0.48      3099
   positive       0.67      0.53      0.59      2363

avg / total       0.72      0.73      0.72     14640



### Kernel Support Vector Machine

In [20]:
from sklearn.svm import SVC

# 5-fold cross-validation of an RBF-kernel support vector classifier.
x = datafr.values[:,1:]                        # every column after the first: retweet_count + document-term matrix
y = datafr.values[:,0]                         # first column: airline_sentiment label
# NOTE(review): KFold does not shuffle by default, so folds follow row order;
# consider shuffle=True or StratifiedKFold if the rows are grouped - confirm.
kf = KFold(n_splits=5)
accuracy=[]                                    # per-fold accuracy in percent
yt = []                                        # true labels pooled over all folds
yp = []                                        # predicted labels pooled over all folds
for train_index, test_index in kf.split(x):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Fit the scaler on the training fold only, then apply it to the test fold.
    sc_x=StandardScaler()
    x_train=sc_x.fit_transform(x_train)
    x_test=sc_x.transform(x_test)
    # `degree` only applies to the polynomial kernel and is ignored for 'rbf',
    # so it is no longer passed.
    classifier = SVC(kernel = 'rbf', random_state=0)
    classifier.fit(x_train,y_train)
    y_pred = classifier.predict(x_test)
    cm=confusion_matrix(y_test,y_pred)
    print('===============================================================================')
    print('Prediction: ', y_pred)
    print('Correct:    ', y_test)
    print("Confusion Matrix:")
    print(cm)
    acc= accuracy_score(y_test,y_pred)*100
    print("Accuracy is: %0.3f" % acc)          # accuracy for this fold
    accuracy.append(acc)
    pr = precision_score(y_test, y_pred,average=None)
    print("Precision :", pr)                   # per-class precision for this fold
    recscr = recall_score(y_test, y_pred,average=None)
    print("Recall :",  recscr)                 # per-class recall for this fold
    f1 = f1_score(y_test, y_pred,average=None)
    print("F1-Score : ",  f1)                  # per-class F1 for this fold
    yt += list(y_test)
    yp += list(y_pred)

avg_accu = np.mean(accuracy)
print("Average Accuracy: %0.3f" % avg_accu)    # mean of the 5 per-fold accuracies
print(classification_report(yt, yp))           # precision / recall / F1 on the predictions pooled over all folds




Prediction:  ['negative' 'negative' 'negative' ..., 'negative' 'negative' 'negative']
Correct:     ['neutral' 'positive' 'neutral' ..., 'negative' 'negative' 'negative']
Confusion Matrix:
[[1790   82   27]
 [ 372  178   35]
 [ 214   60  170]]
Accuracy is: 73.019
Precision : [ 0.753367    0.55625     0.73275862]
Recall : [ 0.94260137  0.3042735   0.38288288]
F1-Score :  [ 0.8374269   0.39337017  0.50295858]
Prediction:  ['negative' 'negative' 'negative' ..., 'negative' 'neutral' 'negative']
Correct:     ['negative' 'negative' 'negative' ..., 'negative' 'negative' 'neutral']
Confusion Matrix:
[[1602   68   30]
 [ 461  191   34]
 [ 259   63  220]]
Accuracy is: 68.750
Precision : [ 0.68992248  0.5931677   0.77464789]
Recall : [ 0.94235294  0.27842566  0.40590406]
F1-Score :  [ 0.7966186   0.37896825  0.53268765]
Prediction:  ['negative' 'negative' 'positive' ..., 'negative' 'negative' 'positive']
Correct:     ['neutral' 'neutral' 'positive' ..., 'negative' 'negative' 'positive']
Confusion 

### k - Nearest Neighbour

In [21]:
from sklearn.neighbors import KNeighborsClassifier

# 5-fold cross-validation of a 5-nearest-neighbour classifier
# (Minkowski metric with p=2).
x = datafr.values[:,1:]                        # every column after the first: retweet_count + document-term matrix
y = datafr.values[:,0]                         # first column: airline_sentiment label
# NOTE(review): KFold does not shuffle by default, so folds follow row order;
# consider shuffle=True or StratifiedKFold if the rows are grouped - confirm.
kf = KFold(n_splits=5)
accuracy=[]                                    # per-fold accuracy in percent
yt = []                                        # true labels pooled over all folds
yp = []                                        # predicted labels pooled over all folds
for train_index, test_index in kf.split(x):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Fit the scaler on the training fold only, then apply it to the test fold.
    sc_x=StandardScaler()
    x_train=sc_x.fit_transform(x_train)
    x_test=sc_x.transform(x_test)
    classifier = KNeighborsClassifier(n_neighbors=5,metric='minkowski',p=2)
    classifier.fit(x_train,y_train)
    y_pred = classifier.predict(x_test)
    cm=confusion_matrix(y_test,y_pred)
    print('===============================================================================')
    print('Prediction: ', y_pred)
    print('Correct:    ', y_test)
    print("Confusion Matrix:")
    print(cm)
    acc= accuracy_score(y_test,y_pred)*100
    print("Accuracy is: %0.3f" % acc)          # accuracy for this fold
    accuracy.append(acc)
    pr = precision_score(y_test, y_pred,average=None)
    print("Precision :", pr)                   # per-class precision for this fold
    recscr = recall_score(y_test, y_pred,average=None)
    print("Recall :",  recscr)                 # per-class recall for this fold
    f1 = f1_score(y_test, y_pred,average=None)
    print("F1-Score : ",  f1)                  # per-class F1 for this fold
    yt += list(y_test)
    yp += list(y_pred)

avg_accu = np.mean(accuracy)
print("Average Accuracy: %0.3f" % avg_accu)    # mean of the 5 per-fold accuracies
print(classification_report(yt, yp))           # precision / recall / F1 on the predictions pooled over all folds




Prediction:  ['negative' 'negative' 'negative' ..., 'negative' 'negative' 'negative']
Correct:     ['neutral' 'positive' 'neutral' ..., 'negative' 'negative' 'negative']
Confusion Matrix:
[[1442  343  114]
 [ 341  172   72]
 [ 216   98  130]]
Accuracy is: 59.563
Precision : [ 0.72136068  0.28058728  0.41139241]
Recall : [ 0.75934702  0.29401709  0.29279279]
F1-Score :  [ 0.7398666   0.28714524  0.34210526]
Prediction:  ['negative' 'negative' 'negative' ..., 'negative' 'neutral' 'neutral']
Correct:     ['negative' 'negative' 'negative' ..., 'negative' 'negative' 'neutral']
Confusion Matrix:
[[1315  296   89]
 [ 364  265   57]
 [ 232  139  171]]
Accuracy is: 59.802
Precision : [ 0.6881214   0.37857143  0.53943218]
Recall : [ 0.77352941  0.38629738  0.31549815]
F1-Score :  [ 0.7283301   0.38239538  0.39813737]
Prediction:  ['negative' 'positive' 'positive' ..., 'negative' 'negative' 'positive']
Correct:     ['neutral' 'neutral' 'positive' ..., 'negative' 'negative' 'positive']
Confusion M