In [140]:
from time import time
import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from nltk.stem import LancasterStemmer
import pandas as pd

# [ models for classification ]
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier


# Prepearing data

In [3]:
def read_data(file_path: str,
              delimiter: str = ",",
              headers: bool = True) -> pd.DataFrame:
    if headers:
        return pd.read_csv(file_path, sep=delimiter)
    return pd.read_csv(file_path, sep=delimiter, header=None)


def clean_sentence(text: str) -> str:
    text = re.sub("@[A-Za-z0-9]+", "", text)
    text = re.sub("#", "", text)
    text = re.sub(r"https?:\S+", "", text)
    letters = list(" qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM")
    for symbol in text:
        if symbol not in letters:
            text = text.replace(symbol, "")
    return text


def clean_sentences(sentences_list: list[str]) -> list[list[str]]:
    """
    Input:
     * sentences_list: list[str] - list of sentences

    Output:
     * list[list[str]] - list of cleansed sentences (without special symbols)

    Example:
        Input: ["Hello my World!",
                "How are you?"]

        Output: [["Hello", "my", "World"],
                 ["How", "are", "you"]]
    """
    cleansed_sentences = []
    for tweet in [clean_sentence(tweet).split(" ")
                  for tweet in sentences_list]:

        cleansed_sentence = [word.strip() for word in tweet
                             if
                             word != "" and
                             "http" not in word and
                             not word.isdigit()]

        cleansed_sentences.append(cleansed_sentence)

    return cleansed_sentences


def flat_lists(sentences_list: list[list[str]]) -> list[str]:
    words = []
    for sentence in sentences_list:
        words += sentence
    return words


In [4]:
# [ loading data ]
all_data_frame = read_data("data/Tweets.csv")
print(all_data_frame)


                 tweet_id airline_sentiment  airline_sentiment_confidence  \
0      570306133677760513           neutral                        1.0000   
1      570301130888122368          positive                        0.3486   
2      570301083672813571           neutral                        0.6837   
3      570301031407624196          negative                        1.0000   
4      570300817074462722          negative                        1.0000   
...                   ...               ...                           ...   
14635  569587686496825344          positive                        0.3487   
14636  569587371693355008          negative                        1.0000   
14637  569587242672398336           neutral                        1.0000   
14638  569587188687634433          negative                        1.0000   
14639  569587140490866689           neutral                        0.6771   

               negativereason  negativereason_confidence         airline  \

In [5]:
airline_sentiment = all_data_frame["airline_sentiment"]
data_frame = pd.DataFrame({"text": all_data_frame["text"]})
print(data_frame)


                                                    text
0                    @VirginAmerica What @dhepburn said.
1      @VirginAmerica plus you've added commercials t...
2      @VirginAmerica I didn't today... Must mean I n...
3      @VirginAmerica it's really aggressive to blast...
4      @VirginAmerica and it's a really big bad thing...
...                                                  ...
14635  @AmericanAir thank you we got on a different f...
14636  @AmericanAir leaving over 20 minutes Late Flig...
14637  @AmericanAir Please bring American Airlines to...
14638  @AmericanAir you have my money, you change my ...
14639  @AmericanAir we have 8 ppl so we need 2 know h...

[14640 rows x 1 columns]


In [6]:
cleaned_data_frame = pd.DataFrame({"text": clean_sentences(data_frame["text"])})
cleansed_words = flat_lists(cleaned_data_frame["text"])
print(cleaned_data_frame)


                                                    text
0                                           [What, said]
1      [plus, youve, added, commercials, to, the, exp...
2      [I, didnt, today, Must, mean, I, need, to, tak...
3      [its, really, aggressive, to, blast, obnoxious...
4      [and, its, a, really, big, bad, thing, about, it]
...                                                  ...
14635  [thank, you, we, got, on, a, different, flight...
14636  [leaving, over, minutes, Late, Flight, No, war...
14637  [Please, bring, American, Airlines, to, BlackB...
14638  [you, have, my, money, you, change, my, flight...
14639  [we, have, ppl, so, we, need, know, how, many,...

[14640 rows x 1 columns]


In [None]:
def lemmatization_sentence(sentence: list[str],
                           lancaster: LancasterStemmer) -> list[str]:

    return [lancaster.stem(word) for word in sentence]


def lemmatization_sentences(sentences: list[list[str]],
                            lancaster: LancasterStemmer) -> list[list[str]]:

    return [lemmatization_sentence(sentence, lancaster) for sentence in sentences]


In [7]:
lancaster = LancasterStemmer()

stemmed_data_frame = pd.DataFrame({"text": lemmatization_sentences(cleaned_data_frame["text"], lancaster)})
print(stemmed_data_frame)


                                                    text
0                                           [what, said]
1       [plu, youv, ad, commerc, to, the, expery, tacky]
2      [i, didnt, today, must, mean, i, nee, to, tak,...
3      [it, real, aggress, to, blast, obnoxy, enterta...
4         [and, it, a, real, big, bad, thing, about, it]
...                                                  ...
14635  [thank, you, we, got, on, a, diff, flight, to,...
14636  [leav, ov, minut, lat, flight, no, warn, or, c...
14637         [pleas, bring, am, airlin, to, blackberry]
14638  [you, hav, my, money, you, chang, my, flight, ...
14639  [we, hav, ppl, so, we, nee, know, how, many, s...

[14640 rows x 1 columns]


In [8]:
stemmed_words = flat_lists(stemmed_data_frame["text"])
bag_of_words = list(set(stemmed_words))
print("Counf of stemmed words:", len(stemmed_words))
print(bag_of_words[:100])


Counf of stemmed words: 233344
['latinam', 'gott', 'desert', 'callback', 'annnnnd', 'elevategold', 'washington', 'circ', 'pandora', 'trit', 'helpa', 'overkil', 'leathers', 'thrilled', 'mirand', 'ic', 'fresh', 'planeso', 'americanair', 'nick', 'happyfriday', 'sel', 'clehelp', 'shaquil', 'institut', 'bledso', 'hopetogetanswersoon', 'sheil', 'inperson', 'troy', 'wheez', 'watson', 'country', 'directb', 'problemss', 'denewr', 'mccarran', 'ward', 'superst', 'upin', 'stick', 'stellarserv', 'mewh', 'priortry', 'brokenwheel', 'myself', 'holy', 'bed', 'trueblu', 'miami', 'ref', 'princesshalf', 'aft', 'chronological', 'fli', 'buy', 'tomorro', 'delyd', 'taxy', 'hel', 'triv', 'ph', 'outpost', 'interview', 'nondelay', 'febru', 'scumb', 'cash', 'facebook', 'shift', 'alcohol', 'k', 'downnnn', 'lifeisgood', 'vil', 'yearround', 'linkemail', 'theworstairlineev', 'jack', 'cough', 'giv', 'nonsens', 'curs', 'piel', 'bei', 'sweresomuchfun', 'laxjfk', 'eqm', 'realtim', 'flgjt', 'justdippin', 'teem', 'disspoin

In [9]:
def sentence_coding(sentence: list[str],
                    bag_of_words: list[str]) -> list[bool]:
    # 1 - word occurs in the bag of words
    # 0 - word does not appear in the sentence
    return [1 if word in sentence else 0 for word in bag_of_words]


def sentences_coding(sentences: list[list[str]],
                     bag_of_words: list[str]) -> list[bool]:
    return [sentence_coding(sentence, bag_of_words) for sentence in sentences]


In [10]:
# [ change words for numbers ]
coded_data_frame = pd.DataFrame(
   sentences_coding(stemmed_data_frame["text"], bag_of_words)
    )

print(coded_data_frame)


       0     1     2     3     4     5     6     7     8     9     ...  8997  \
0         0     0     0     0     0     0     0     0     0     0  ...     0   
1         0     0     0     0     0     0     0     0     0     0  ...     0   
2         0     0     0     0     0     0     0     0     0     0  ...     0   
3         0     0     0     0     0     0     0     0     0     0  ...     0   
4         0     0     0     0     0     0     0     0     0     0  ...     0   
...     ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   ...   
14635     0     0     0     0     0     0     0     0     0     0  ...     0   
14636     0     0     0     0     0     0     0     0     0     0  ...     0   
14637     0     0     0     0     0     0     0     0     0     0  ...     0   
14638     0     0     0     0     0     0     0     0     0     0  ...     0   
14639     0     0     0     0     0     0     0     0     0     0  ...     0   

       8998  8999  9000  9001  9002  90

In [12]:
scaler = StandardScaler()

normal_data_frame = pd.DataFrame(
    scaler.fit_transform(coded_data_frame)
    )

print(normal_data_frame)


           0         1         2         3         4         5         6     \
0     -0.008265 -0.032026 -0.011689 -0.037901 -0.008265 -0.008265 -0.029812   
1     -0.008265 -0.032026 -0.011689 -0.037901 -0.008265 -0.008265 -0.029812   
2     -0.008265 -0.032026 -0.011689 -0.037901 -0.008265 -0.008265 -0.029812   
3     -0.008265 -0.032026 -0.011689 -0.037901 -0.008265 -0.008265 -0.029812   
4     -0.008265 -0.032026 -0.011689 -0.037901 -0.008265 -0.008265 -0.029812   
...         ...       ...       ...       ...       ...       ...       ...   
14635 -0.008265 -0.032026 -0.011689 -0.037901 -0.008265 -0.008265 -0.029812   
14636 -0.008265 -0.032026 -0.011689 -0.037901 -0.008265 -0.008265 -0.029812   
14637 -0.008265 -0.032026 -0.011689 -0.037901 -0.008265 -0.008265 -0.029812   
14638 -0.008265 -0.032026 -0.011689 -0.037901 -0.008265 -0.008265 -0.029812   
14639 -0.008265 -0.032026 -0.011689 -0.037901 -0.008265 -0.008265 -0.029812   

           7         8         9     ...      8997 

In [49]:
X_train, X_test, y_train, y_test = train_test_split(normal_data_frame, airline_sentiment, test_size=0.2)


# MLPClassifier

In [104]:
mlp_classifier = MLPClassifier(
    hidden_layer_sizes=(150, 200, 150, 100, 50),
    batch_size=200,
    max_iter=400,
    random_state=1,
    learning_rate_init=0.01)


In [105]:
mlp_classifier.fit(X_train, y_train)


MLPClassifier(batch_size=200, hidden_layer_sizes=(150, 200, 150, 100, 50),
              learning_rate_init=0.01, max_iter=400, random_state=1)

In [106]:
mlp_y_pred = mlp_classifier.predict(X_test)


In [107]:
print(classification_report(y_test, mlp_y_pred))


              precision    recall  f1-score   support

    negative       0.76      0.90      0.82      1857
     neutral       0.55      0.44      0.49       589
    positive       0.74      0.39      0.51       482

    accuracy                           0.72      2928
   macro avg       0.68      0.58      0.61      2928
weighted avg       0.71      0.72      0.71      2928



# RandomForestClassifier

In [112]:
rf_classifier = RandomForestClassifier()


In [113]:
rf_classifier.fit(X_train, y_train)


RandomForestClassifier()

In [114]:
rf_y_pred = rf_classifier.predict(X_test)


In [115]:
print(classification_report(y_test, rf_y_pred))


              precision    recall  f1-score   support

    negative       0.77      0.96      0.85      1857
     neutral       0.68      0.38      0.49       589
    positive       0.88      0.49      0.63       482

    accuracy                           0.77      2928
   macro avg       0.78      0.61      0.66      2928
weighted avg       0.77      0.77      0.74      2928



# GaussianNB

In [142]:
gauss_model = GaussianNB()


In [143]:
gauss_model.fit(X_train, y_train)


GaussianNB()

In [144]:
gauss_model_y_pred = gauss_model.predict(X_test)


In [145]:
print(classification_report(y_test, gauss_model_y_pred))


              precision    recall  f1-score   support

    negative       0.83      0.34      0.49      1857
     neutral       0.26      0.28      0.27       589
    positive       0.23      0.74      0.36       482

    accuracy                           0.40      2928
   macro avg       0.44      0.46      0.37      2928
weighted avg       0.62      0.40      0.42      2928



# BernoulliNB

In [146]:
bernoulli_model = BernoulliNB()


In [147]:
bernoulli_model.fit(X_train, y_train)


BernoulliNB()

In [148]:
bernoulli_model_y_pred = bernoulli_model.predict(X_test)


In [149]:
print(classification_report(y_test, bernoulli_model_y_pred))


              precision    recall  f1-score   support

    negative       0.83      0.92      0.88      1857
     neutral       0.61      0.55      0.58       589
    positive       0.80      0.58      0.67       482

    accuracy                           0.79      2928
   macro avg       0.75      0.68      0.71      2928
weighted avg       0.78      0.79      0.78      2928



In [150]:
from sklearn.linear_model import LogisticRegression


0.6724726775956285


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)


In [None]:
res = logreg.score(X_test, y_test)

print("Logistic Regression accuracy:", res)
