## Importing the Required Libraries

In [None]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
from nltk import pos_tag
import warnings
warnings.filterwarnings("ignore")
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import tree

## Preparing Training Data

### Importing Training Data

In [None]:
# Load the training tweets from the CSV file in the working directory.
df_train = pd.read_csv('training_twitter_x_y_train.csv')

In [None]:
# Keep only the tweet text and its sentiment label; every other column is dropped.
df_train = df_train[['text', 'airline_sentiment']]

In [None]:
# Convert the frame to an array of rows: index 0 is the text, index 1 the sentiment.
training_data = df_train.values

In [None]:
import nltk

# Fetch the NLTK resources this notebook relies on (tokenizer, stop-word list,
# POS tagger and WordNet) before any of them is used.
for resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

### Splitting the Tweet text into words using NLTK

In [None]:
# Tokenise every tweet, keeping each token list paired with its sentiment label
# as a two-element [tokens, sentiment] list.
tweets_train = [[word_tokenize(row[0]), row[1]] for row in training_data]

### Cleaning the Words using WordNetLemmatizer available in NLTK

In [None]:
# Tokens to discard while cleaning: English stop words plus every character in
# string.punctuation.
punctuations = list(string.punctuation)
stops = set(stopwords.words('english')).union(punctuations)

In [None]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    """Map a POS tag string to the corresponding WordNet POS constant.

    Tags beginning with 'J', 'V', 'N' or 'R' map to adjective, verb, noun and
    adverb respectively; any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

In [None]:
lemmatizer = WordNetLemmatizer()

def clean_tweets(words):
    """Return the lower-case lemmas of the meaningful tokens of one tweet.

    Args:
        words: List of token strings for a single tweet, in their original
            order and casing.

    Returns:
        A list with one lower-case lemma per kept token. Tokens that are not
        purely alphabetic, or whose lower-case form is in ``stops``, are
        dropped.
    """
    output_words = []
    # Tag the whole token sequence in a single call so the tagger can use the
    # neighbouring tokens as context; tagging one word at a time discards that
    # context and invokes the tagger once per word.
    for w, tag in pos_tag(words):
        if not w.isalpha():
            continue
        lowered = w.lower()
        if lowered in stops:
            continue
        # WordNet lookups are case-sensitive, so lemmatize the lower-cased
        # form; a capitalised token such as "Flights" would otherwise come
        # back unchanged.
        output_words.append(lemmatizer.lemmatize(lowered, pos=get_simple_pos(tag)))
    return output_words

In [None]:
# Replace each tweet's raw token list with its cleaned lemmas, in place,
# keeping the sentiment label alongside it.
for idx, (tokens, sentiment) in enumerate(tweets_train):
    tweets_train[idx] = (clean_tweets(tokens), sentiment)

In [None]:
# Re-join each cleaned token list into one space-separated string for the
# vectorizer, and collect the sentiment labels in the same order.
tweets = [" ".join(tokens) for tokens, label in tweets_train]
y_train = [label for tokens, label in tweets_train]

### Using Count Vectorizer to get the X Train

In [None]:
# Build count features from the cleaned training tweets, with the vocabulary
# capped at 2000 features. N-grams were tried as well, but they lowered the accuracy.
count_vec = CountVectorizer(max_features=2000)
x_train_features = count_vec.fit_transform(tweets)

## Preparing Testing Data

In [None]:
# Load the test tweets from the CSV file in the working directory.
df_test = pd.read_csv('test_twitter_x_test.csv')

In [None]:
# Pull out the tweet text column as a NumPy array.
testing_data = np.array(df_test['text'])

In [None]:
# Run the test tweets through the same tokenise -> clean -> re-join steps used
# for the training tweets.
tweets_test = [" ".join(clean_tweets(word_tokenize(text))) for text in testing_data]

In [None]:
# Vectorize the test tweets with the vectorizer already fitted on the training
# tweets (transform only, no refit).
x_test_features = count_vec.transform(tweets_test)

## Performing Classification

### Support Vector Machine

In [None]:
# Fit a support vector classifier, constructed with no arguments (library
# defaults), on the training count features.
svc = SVC()
svc.fit(x_train_features, y_train)

In [None]:
# Predict a sentiment label for every test tweet with the fitted SVM.
y_pred_svm = svc.predict(x_test_features)

In [None]:
# Save the SVM predictions to CSV, written without the index and without a
# header row.
df = pd.DataFrame(y_pred_svm)
df.to_csv('predictions_svm.csv', index = False, header = False)

### Random Forest

In [None]:
# Fit a random forest classifier, constructed with no arguments (library
# defaults), on the training count features.
rf = RandomForestClassifier()
rf.fit(x_train_features, y_train)

In [None]:
# Predict a sentiment label for every test tweet with the fitted random forest.
y_pred_rf = rf.predict(x_test_features)

In [None]:
# Save the random forest predictions to CSV, written without the index and
# without a header row.
df = pd.DataFrame(y_pred_rf)
df.to_csv('predictions_rf.csv', index = False, header = False)

### Multinomial Naive Bayes

In [None]:
# Fit a multinomial naive Bayes classifier (alpha set to 1) on the training
# count features.
mnv = MultinomialNB(alpha = 1)
mnv.fit(x_train_features, y_train)

In [None]:
# Predict a sentiment label for every test tweet with the fitted naive Bayes model.
y_pred_mnv = mnv.predict(x_test_features)

In [None]:
# Save the naive Bayes predictions to CSV, written without the index and
# without a header row.
df = pd.DataFrame(y_pred_mnv)
df.to_csv('predictions_mnv.csv', index = False, header = False)

### Decision Tree

In [None]:
# Fit a decision tree classifier, constructed with no arguments (library
# defaults), on the training count features.
dt = tree.DecisionTreeClassifier()
dt.fit(x_train_features, y_train)

In [None]:
# Predict a sentiment label for every test tweet with the fitted decision tree.
y_pred_dt = dt.predict(x_test_features)

In [None]:
# Save the decision tree predictions to CSV, written without the index and
# without a header row.
df = pd.DataFrame(y_pred_dt)
df.to_csv('predictions_dt.csv', index = False, header = False)