In [1]:
import pandas as pd
import nltk
import re
import string
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
df_train = pd.read_csv("training.csv", encoding='latin-1')

In [3]:
df_test = pd.read_csv("test.csv", encoding='latin-1')
df_test

Unnamed: 0,text,label
0,im feeling rather rotten so im not very ambiti...,0
1,im updating my blog because i feel shitty,0
2,i never make her separate from me because i do...,0
3,i left with my bouquet of red and yellow tulip...,1
4,i was feeling a little vain when i did this one,0
...,...,...
1995,i just keep feeling like someone is being unki...,3
1996,im feeling a little cranky negative after this...,3
1997,i feel that i am useful to my people and that ...,1
1998,im feeling more comfortable with derby i feel ...,1


In [4]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

def clean_tokenized_lemmatized(tweet):
    tweet = tweet.lower()                                                       #converting the text into lower
    tweet = re.sub(r'@\w+', '', tweet)                                          #removing @mentions
    tweet = re.sub(r'#\w+', '', tweet)                                          #removing hashtags
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE)   #removing url          
    tweet = re.sub(r'[^\w\s]','',tweet)                                         #removing punctuations
    tweet = re.sub(r'\d+', '', tweet)                                           #removing numbers
    tweet = re.sub(r'\s+', ' ', tweet).strip()                                  #removing extra whitespaces  
    
    lemmatizer = WordNetLemmatizer()
    stops = stopwords.words('english')
    
    tokens = nltk.word_tokenize(tweet)
    tokens = [t for t in tokens if not t in stops]
    fintokens = []
    for token in tokens:
        fintokens.append(lemmatizer.lemmatize(token))
    finaltext = " "
    return finaltext.join(fintokens)

for i in df_train.index:
    string = df_train['text'][i]
    preprostr = clean_tokenized_lemmatized(string)
    df_train.at[i, 'text'] = preprostr

for i in df_test.index:
    preprostr = clean_tokenized_lemmatized(string)
    df_test.at[i, 'text'] = preprostr


In [5]:
def emotion_detection(value):
    if value == 0:
        return "sadness"
    elif value == 1:
        return "joy"
    elif value == 2:
        return "love"
    elif value == 3:
        return "anger"
    elif value == 4:
        return "fear"
df_train['emotion'] = df_train['label'].map(emotion_detection)
df_test['emotion'] = df_test['label'].map(emotion_detection)

In [6]:
df_test

Unnamed: 0,text,label,emotion
0,feel like wan na buy cute make see online even...,0,sadness
1,feel like wan na buy cute make see online even...,0,sadness
2,feel like wan na buy cute make see online even...,0,sadness
3,feel like wan na buy cute make see online even...,1,joy
4,feel like wan na buy cute make see online even...,0,sadness
...,...,...,...
1995,feel like wan na buy cute make see online even...,3,anger
1996,feel like wan na buy cute make see online even...,3,anger
1997,feel like wan na buy cute make see online even...,1,joy
1998,feel like wan na buy cute make see online even...,1,joy


In [7]:
positive = ['joy', 'love']
negative = ['sadness', 'fear', 'anger']
def emotion_detection(value):
    if value in positive:
        return 1
    elif value in negative:
        return 0
    else:
        return 2
df_train['sentiment'] = df_train['emotion'].map(emotion_detection)
df_test['sentiment'] = df_test['emotion'].map(emotion_detection)

In [10]:
#df_train.drop('emotion_n')
#df_test.drop('emotion_n')

In [11]:
df_train.label.value_counts()

1    6066
0    5216
3    2434
4    2149
2    1482
5     653
Name: label, dtype: int64

In [12]:
df_test.label.value_counts()

1    695
0    581
3    275
4    224
2    159
5     66
Name: label, dtype: int64

In [13]:
df_test.label.value_counts()

1    695
0    581
3    275
4    224
2    159
5     66
Name: label, dtype: int64

In [14]:
df_train.label.value_counts()

1    6066
0    5216
3    2434
4    2149
2    1482
5     653
Name: label, dtype: int64

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier

In [16]:
clf = Pipeline([
     ('vectorizer_bow', CountVectorizer(ngram_range = (1, 3))),
     ('SVM', LinearSVC())         
])

In [17]:
clf2 = Pipeline([
     ('vectorizer_bow', CountVectorizer(ngram_range = (1, 3))),
     ('DT', DecisionTreeClassifier())         
])

In [18]:
clf3 = Pipeline([
     ('vectorizer_bow', CountVectorizer(ngram_range = (1, 3))),
     ('NB', MultinomialNB())         
])

In [19]:
clf4 = Pipeline([
     ('vectorizer_bow', CountVectorizer(ngram_range = (1, 3))),
     ('NB', XGBClassifier())         
])

In [20]:
clf.fit(df_train.text, df_train.label)

Pipeline(steps=[('vectorizer_bow', CountVectorizer(ngram_range=(1, 3))),
                ('SVM', LinearSVC())])

In [21]:
clf2.fit(df_train.text, df_train.label)

Pipeline(steps=[('vectorizer_bow', CountVectorizer(ngram_range=(1, 3))),
                ('DT', DecisionTreeClassifier())])

In [22]:
clf3.fit(df_train.text, df_train.label)

Pipeline(steps=[('vectorizer_bow', CountVectorizer(ngram_range=(1, 3))),
                ('NB', MultinomialNB())])

In [23]:
clf4.fit(df_train.text, df_train.label)

Pipeline(steps=[('vectorizer_bow', CountVectorizer(ngram_range=(1, 3))),
                ('NB',
                 XGBClassifier(base_score=None, booster=None, callbacks=None,
                               colsample_bylevel=None, colsample_bynode=None,
                               colsample_bytree=None,
                               early_stopping_rounds=None,
                               enable_categorical=False, eval_metric=None,
                               feature_types=None, gamma=None, gpu_id=None,
                               grow_policy=None, importance_type=None,
                               interaction_constraints=None, learning_rate=None,
                               max_bin=None, max_cat_threshold=None,
                               max_cat_to_onehot=None, max_delta_step=None,
                               max_depth=None, max_leaves=None,
                               min_child_weight=None, missing=nan,
                               monotone_constraints=None

In [24]:
y_pred = clf.predict(df_test.text)

In [25]:
y_pred

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [26]:
set(df_test) - set(y_pred)

{'emotion', 'label', 'sentiment', 'text'}

In [30]:
print(classification_report(df_test.label, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       581
           1       0.35      1.00      0.52       695
           2       0.00      0.00      0.00       159
           3       0.00      0.00      0.00       275
           4       0.00      0.00      0.00       224
           5       0.00      0.00      0.00        66

    accuracy                           0.35      2000
   macro avg       0.06      0.17      0.09      2000
weighted avg       0.12      0.35      0.18      2000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
y_pred = clf2.predict(df_test.text)

In [32]:
print(classification_report(df_test.label, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       581
           1       0.35      1.00      0.52       695
           2       0.00      0.00      0.00       159
           3       0.00      0.00      0.00       275
           4       0.00      0.00      0.00       224
           5       0.00      0.00      0.00        66

    accuracy                           0.35      2000
   macro avg       0.06      0.17      0.09      2000
weighted avg       0.12      0.35      0.18      2000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
y_pred = clf3.predict(df_test.text)

In [34]:
print(classification_report(df_test.label, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       581
           1       0.35      1.00      0.52       695
           2       0.00      0.00      0.00       159
           3       0.00      0.00      0.00       275
           4       0.00      0.00      0.00       224
           5       0.00      0.00      0.00        66

    accuracy                           0.35      2000
   macro avg       0.06      0.17      0.09      2000
weighted avg       0.12      0.35      0.18      2000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [35]:
y_pred = clf4.predict(df_test.text)

In [36]:
print(classification_report(df_test.label, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       581
           1       0.35      1.00      0.52       695
           2       0.00      0.00      0.00       159
           3       0.00      0.00      0.00       275
           4       0.00      0.00      0.00       224
           5       0.00      0.00      0.00        66

    accuracy                           0.35      2000
   macro avg       0.06      0.17      0.09      2000
weighted avg       0.12      0.35      0.18      2000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [37]:
from joblib import Parallel, delayed
import joblib

In [28]:
joblib.dump(clf, 'svm_model.pkl')

['svm_model.pkl']