In [1]:
import pandas as pd
import re

from nltk import PorterStemmer, SnowballStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
import nltk
from nltk.stem import WordNetLemmatizer 

import warnings
warnings.filterwarnings('ignore')


col = ['target', 'id', 'date', 'flag', 'user', 'text']

df = pd.read_csv(r'training.1600000.processed.noemoticon.csv', header = None, names = col,  encoding='latin-1')


df.dropna()

df = df.sample(n=50000)

# Handle Categories variable
df['target'] = df['target'].replace(4,1)


df['text'].head()

1335657            @LaDii_Trice LOL calm ya hyper ass down. 
331743     Thinking of you, your company, your eyes, your...
504703        is working late. Recovering from stomach bug. 
1113931    Listening to good music, laying out by the poo...
1002855    @princessa_the1 It would be great if we went o...
Name: text, dtype: object

In [2]:

def get_part_of_day(h):
        if   5 <= h <= 11:
            return 'morning'
        elif 12 <= h <= 17:
            return 'afternoon'
        elif 18 <= h <= 22:
            return 'evening'
        else:
            return 'night'


day = []
month = []
dayInMonth = []
timeOfTweet = []
timePeriod = []
year = []

for d in df['date']:
    sub = d.split(' ')
    day.append(sub[0])
    dayInMonth.append(sub[2])
    timePeriod.append(get_part_of_day(int(sub[3][:2])))

    # time zones is only pdt
    # timeZone.append(sub[4])

    # year is only 2009
    # year.append(sub[5])



data = pd.DataFrame()

data['day'] = day
data['dayInMonth'] = dayInMonth
data['partOfDay'] = timePeriod

data = pd.get_dummies(data, columns=['day','partOfDay'])



In [3]:
data['target'] = df['target'].values
data['text'] = df['text'].values

df = data

data.head()

Unnamed: 0,dayInMonth,day_Fri,day_Mon,day_Sat,day_Sun,day_Thu,day_Tue,day_Wed,partOfDay_afternoon,partOfDay_evening,partOfDay_morning,partOfDay_night,target,text
0,3,0,0,0,0,0,0,1,0,0,1,0,1,@LaDii_Trice LOL calm ya hyper ass down.
1,2,0,0,0,0,0,1,0,0,1,0,0,0,"Thinking of you, your company, your eyes, your..."
2,15,0,1,0,0,0,0,0,0,1,0,0,0,is working late. Recovering from stomach bug.
3,30,0,0,1,0,0,0,0,0,0,1,0,1,"Listening to good music, laying out by the poo..."
4,22,1,0,0,0,0,0,0,0,0,0,1,1,@princessa_the1 It would be great if we went o...


In [4]:
print("starting text clean")

# nltk.download('wordnet')
# nltk.download('omw-1.4')

# Init the Wordnet Lemmatizer
lemmatizer = WordNetLemmatizer()



ps = PorterStemmer()
sb = SnowballStemmer(language='english')


all_stopwords = stopwords.words('english')

corpus = []
corpus1 = []



for sen in df['text']:
    # remove hashtags
    sen = re.sub("(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", ' ', sen)
#     sen = re.sub('(@|#|&)+([a-zA-Z0-9_]+)', '', sen)
    # remove weird chars
    sen = re.sub('[^a-zA-z\'\"]+', ' ', sen)
    # remove urls
    sen = re.sub(r'\$\w*', '', sen)
    # remove old style retweet text "RT"
    sen = re.sub(r'^RT[\s]+', '', sen)
    # remove hyperlinks
    sen = re.sub(r'https?:\/\/.*[\r\n]*', '', sen)
    # only removing the hash # sign from the word
    sen = re.sub(r'#', '', sen)

    

    sen = sen.lower()
    sen = sen.split()

    


    
            # porter stemmer vs snowball stemmer
#     text = [ps.stem(word) for word in sen if not word in set(all_stopwords)]

        # lemmtization vs stemming    (word meaning vs stem)
#   text1 = [sb.stem(word) for word in sen if not word in set(all_stopwords)]
    text1 = [lemmatizer.lemmatize(word) for word in sen if not word in set(all_stopwords)]
    
    
#     text = ' '.join(text)
    text1 = ' '.join(text1)
    
    
#     corpus.append(text)
    corpus1.append(text1)



starting text clean


In [5]:
x = pd.DataFrame({'col':corpus1})
x.head()

Unnamed: 0,col
0,lol calm ya hyper as
1,thinking company eye personality amazingness g...
2,working late recovering stomach bug
3,listening good music laying pool great jeeeeezy
4,would great went date


In [6]:
y = df['target'].values

df = pd.concat([df, x], axis=1, join='inner')
df = df.drop(['target','text'],axis=1)

df.head()

Unnamed: 0,dayInMonth,day_Fri,day_Mon,day_Sat,day_Sun,day_Thu,day_Tue,day_Wed,partOfDay_afternoon,partOfDay_evening,partOfDay_morning,partOfDay_night,col
0,3,0,0,0,0,0,0,1,0,0,1,0,lol calm ya hyper as
1,2,0,0,0,0,0,1,0,0,1,0,0,thinking company eye personality amazingness g...
2,15,0,1,0,0,0,0,0,0,1,0,0,working late recovering stomach bug
3,30,0,0,1,0,0,0,0,0,0,1,0,listening good music laying pool great jeeeeezy
4,22,1,0,0,0,0,0,0,0,0,0,1,would great went date


In [7]:
X_train, X_test,y_train, y_test = train_test_split(df, y, test_size=0.2, random_state = 42)

In [8]:

from keras.preprocessing.text import Tokenizer
from keras.utils.data_utils import pad_sequences

token = Tokenizer()


seq = token.texts_to_sequences(X_train['col'])
pad_seq = pad_sequences(seq,maxlen=300)


In [9]:

from tqdm import tqdm
import numpy as np


vocab_size = len(token.word_index)+1


embedding_vector = {}
f = open('glove.840B.300d.txt')
for line in tqdm(f):
    value = line.split(' ')
    word = value[0]
    coef = np.array(value[1:],dtype = 'float32')
    embedding_vector[word] = coef

2196018it [03:03, 11956.70it/s]


In [10]:
embedding_matrix = np.zeros((vocab_size,300))
for word,i in token.word_index.items():
    embedding_value = embedding_vector.get(word)
    if embedding_value is not None:
        embedding_matrix[i] = embedding_value

In [11]:
from keras.layers import Embedding
embedding_layer = Embedding(300,
                            300,
                            weights=[embedding_matrix],
                            input_length=500,
                            trainable=False)

In [12]:
seq1 = token.texts_to_sequences(X_test['col'])
pad_seq1 = pad_sequences(seq1,maxlen=300)


x_train = pad_seq
x_test= pad_seq1


In [13]:

x_train1 = pd.DataFrame(x_train)
x_test1 = pd.DataFrame(x_test)

In [14]:
x_train1.shape

(40000, 300)

In [15]:
X_train.shape

(40000, 13)

In [16]:
X_train = X_train.drop(['col'],axis=1)
X_test = X_test.drop(['col'],axis=1)

In [17]:
X_train = X_train.reset_index()
x_train1 = x_train1.reset_index()
X_test = X_test.reset_index()
x_test1 = x_test1.reset_index()


# print(x_train1.shape)
X_train1 = pd.concat([X_train, x_train1], axis=1)
X_test1 = pd.concat([X_test, x_test1], axis=1)

In [18]:
# models

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier


DecTr = DecisionTreeClassifier()
AdaBoost = AdaBoostClassifier()
Knn = KNeighborsClassifier()

DecTr1 = DecisionTreeClassifier()
AdaBoost1 = AdaBoostClassifier()
Knn1 = KNeighborsClassifier()


In [19]:
print("fitting extended data")
DecTr.fit(X_train,y_train)
AdaBoost.fit(X_train,y_train)
Knn.fit(X_train,y_train)

fitting extended data


KNeighborsClassifier()

In [20]:
print("fitting only text")
DecTr1.fit(X_train1,y_train)
AdaBoost1.fit(X_train1,y_train)
Knn1.fit(X_train1,y_train)

fitting only text


KNeighborsClassifier()

In [21]:
print("predicting extended")
predictionsa = DecTr.predict(X_test)
predictionsb = AdaBoost.predict(X_test)
predictionsc = Knn.predict(X_test)

predicting extended


In [22]:
print("predicting original")
predictions1a = DecTr1.predict(X_test1)
predictions1b = AdaBoost1.predict(X_test1)
predictions1c = Knn1.predict(X_test1)

predicting original


In [23]:
print("Decision tree\n")

print("extended:")
print(confusion_matrix(y_test,predictionsa))
print(classification_report(y_test,predictionsa))
print(accuracy_score(y_test,predictionsa))

print("original:")
print(confusion_matrix(y_test,predictions1a))
print(classification_report(y_test,predictions1a))
print(accuracy_score(y_test,predictions1a))

Decision tree

extended:
[[3033 1926]
 [2012 3029]]
              precision    recall  f1-score   support

           0       0.60      0.61      0.61      4959
           1       0.61      0.60      0.61      5041

    accuracy                           0.61     10000
   macro avg       0.61      0.61      0.61     10000
weighted avg       0.61      0.61      0.61     10000

0.6062
original:
[[2951 2008]
 [2037 3004]]
              precision    recall  f1-score   support

           0       0.59      0.60      0.59      4959
           1       0.60      0.60      0.60      5041

    accuracy                           0.60     10000
   macro avg       0.60      0.60      0.60     10000
weighted avg       0.60      0.60      0.60     10000

0.5955


In [24]:
print("Adaboost\n")

print("extended:")
print(confusion_matrix(y_test,predictionsb))
print(classification_report(y_test,predictionsb))
print(accuracy_score(y_test,predictionsb))

print("original:")
print(confusion_matrix(y_test,predictions1b))
print(classification_report(y_test,predictions1b))
print(accuracy_score(y_test,predictions1b))

Adaboost

extended:
[[2585 2374]
 [1389 3652]]
              precision    recall  f1-score   support

           0       0.65      0.52      0.58      4959
           1       0.61      0.72      0.66      5041

    accuracy                           0.62     10000
   macro avg       0.63      0.62      0.62     10000
weighted avg       0.63      0.62      0.62     10000

0.6237
original:
[[2581 2378]
 [1389 3652]]
              precision    recall  f1-score   support

           0       0.65      0.52      0.58      4959
           1       0.61      0.72      0.66      5041

    accuracy                           0.62     10000
   macro avg       0.63      0.62      0.62     10000
weighted avg       0.63      0.62      0.62     10000

0.6233


In [25]:
print("KNN\n")

print("extended:")
print(confusion_matrix(y_test,predictionsc))
print(classification_report(y_test,predictionsc))
print(accuracy_score(y_test,predictionsc))

print("original:")
print(confusion_matrix(y_test,predictions1c))
print(classification_report(y_test,predictions1c))
print(accuracy_score(y_test,predictions1c))

KNN

extended:
[[2802 2157]
 [2412 2629]]
              precision    recall  f1-score   support

           0       0.54      0.57      0.55      4959
           1       0.55      0.52      0.54      5041

    accuracy                           0.54     10000
   macro avg       0.54      0.54      0.54     10000
weighted avg       0.54      0.54      0.54     10000

0.5431
original:
[[2553 2406]
 [2483 2558]]
              precision    recall  f1-score   support

           0       0.51      0.51      0.51      4959
           1       0.52      0.51      0.51      5041

    accuracy                           0.51     10000
   macro avg       0.51      0.51      0.51     10000
weighted avg       0.51      0.51      0.51     10000

0.5111
