In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
import string
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

In [2]:
df_train = pd.read_csv('training_twitter_x_y_train.csv')
df_test = pd.read_csv('test_twitter_x_test.csv')

In [3]:
text_train = df_train['text']
text_test = df_test['text']

In [4]:
# extracted all the words
words_train = [word_tokenize(tweet)[2:] for tweet in text_train]
words_test = [word_tokenize(tweet)[2:] for tweet in text_test]

In [5]:
stops = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stops.update(punctuations)

In [6]:
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('N'):
        return wordnet.NOUN
    if tag.startswith('R'):
        return wordnet.ADJ
    return wordnet.NOUN

In [7]:
lemmatizer = WordNetLemmatizer()
def clean_tweet(tweet):
    output_tweet = []
    for w in tweet:
        if w.lower() not in stops:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_tweet.append(clean_word.lower())
    return output_tweet

In [8]:
words_train_clean = [clean_tweet(tweet) for tweet in words_train]
words_test_clean = [clean_tweet(tweet) for tweet in words_test]

In [9]:
text_train_clean = [" ".join(tweet) for tweet in words_train_clean]
text_test_clean = [" ".join(tweet) for tweet in words_test_clean]

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
count_vec = CountVectorizer(max_features=2000, ngram_range=(1,2))
a = count_vec.fit_transform(text_train_clean)
b = count_vec.transform(text_test_clean)
x_train = a.todense()
x_test = b.todense()

In [12]:
y_train = df_train['airline_sentiment']

In [13]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(x_train, y_train)

MultinomialNB()

In [14]:
clf.score(x_train, y_train)

0.7892531876138433

In [15]:
from sklearn.svm import SVC
clf_svm = SVC()
clf_svm.fit(x_train, y_train)

SVC()

In [16]:
clf_svm.score(x_train, y_train)

0.8967213114754098

In [17]:
print(len(x_train))
print(len(y_train))

10980
10980


In [18]:
y_pred = clf_svm.predict(x_test)

In [19]:
import numpy as np

In [20]:
df_ans = pd.DataFrame(y_pred)
df_ans

Unnamed: 0,0
0,negative
1,negative
2,negative
3,negative
4,positive
...,...
3655,negative
3656,negative
3657,negative
3658,positive


In [21]:
df_ans.to_csv('twitter_pred.csv',header=False,index=False)