In [1]:
import pandas as pd
import numpy as np
import re

In [4]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [8]:
train = pd.read_csv("training_twitter_x_y_train.csv")
test = pd.read_csv("test_twitter_x_test.csv")

In [9]:
drop_cols = ['airline_sentiment_gold','name','tweet_id', 'retweet_count','tweet_created','user_timezone','tweet_coord','tweet_location']
train.drop(drop_cols, axis = 1, inplace = True)
test.drop(drop_cols, axis = 1, inplace = True)

In [10]:
stops = stopwords.words('english')
stops += list(punctuation)
stops += ['flight', 'airline', 'flights', 'AA']

In [11]:
abbreviations = {'ppl': 'people','cust':'customer','serv':'service','mins':'minutes','hrs':'hours','svc': 'service',
           'u':'you','pls':'please'}
train_index = train[~train.negativereason_gold.isna()].index
test_index = test[~test.negativereason_gold.isna()].index

In [None]:
# train.negativereason_gold.isna():

# This checks whether each value in the negativereason_gold column of the train DataFrame is NaN (missing).
# It returns a boolean Series where True represents missing values, and False represents non-missing values.
# ~train.negativereason_gold.isna():

# The tilde (~) operator is a bitwise NOT operator. It inverts the boolean Series.
# ~ turns True to False and False to True.
# Now, True corresponds to rows where negativereason_gold is not missing.
# train[~train.negativereason_gold.isna()]:

# This filters the train DataFrame, selecting only rows where negativereason_gold is not missing.
# .index:

# The .index attribute retrieves the indices (row labels) of the filtered DataFrame.
# These indices can later be used to reference specific rows.


In [15]:
for index, row in train.iterrows():
    tweet = row.text
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))','',tweet) #remove links
    tweet = re.sub(r'@[^\s]+','',tweet) #remove usernames
    tweet = re.sub(r'[\s]+', ' ', tweet) #remove additional whitespaces
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet) #replace #word with word
    tweet = tweet.strip('\'"') #trim tweet
    words = []
    for word in tweet.split():
        if word.lower() not in stops:
            if word in list(abbreviations.keys()):
                words.append(abbreviations[word])
            else:
                words.append(word.lower())
    tweet = " ".join(words)
    tweet = " %s %s" % (tweet, row.airline)
    row.text = tweet
    if index in train_index:
        row.text = " %s %s" % (row.text, row.negativereason_gold)

In [16]:
for index, row in test.iterrows():
    tweet = row.text
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))','',tweet) #remove links
    tweet = re.sub(r'@[^\s]+','',tweet) #remove usernames
    tweet = re.sub(r'[\s]+', ' ', tweet) #remove additional whitespaces
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet) #replace #word with word
    tweet = tweet.strip('\'"') #trim tweet
    words = []
    for word in tweet.split():
        if word.lower() not in stops:
            if word in list(abbreviations.keys()):
                words.append(abbreviations[word])
            else:
                words.append(word.lower())
    tweet = " ".join(words)
    tweet = " %s %s" % (tweet, row.airline)
    row.text = tweet
    if index in test_index:
        row.text = " %s %s" % (row.text, row.negativereason_gold)

In [17]:
del train['negativereason_gold']
del test['negativereason_gold']

In [18]:
def deEmojify(inputString):
    return inputString.encode('ascii', 'ignore').decode('ascii')
for index, row in train.iterrows():
    row.text = deEmojify(row.text)
for index, row in test.iterrows():
    row.text = deEmojify(row.text)

In [19]:
def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)
for index, row in train.iterrows():
    words = row.text.split()
    new_words = []
    for word in words:
        if not hasNumbers(word):
            new_words.append(word)
    row.text = " ".join(new_words)
for index, row in test.iterrows():
    words = row.text.split()
    new_words = []
    for word in words:
        if not hasNumbers(word):
            new_words.append(word)
    row.text = " ".join(new_words)

In [20]:
train.head()

Unnamed: 0,airline_sentiment,airline,text
0,negative,Southwest,"scheduled morning, 2 days fact, yes..not sure..."
1,positive,Southwest,seeing workers time time going beyond love fl...
2,positive,United,"flew ord miami back great crew, service legs...."
3,negative,Southwest,that's horse radish 😤🐴 Southwest
4,negative,United,"ord delayed air force one, last sbn 8:20, 5 m..."


In [23]:
v = TfidfVectorizer(analyzer = 'word', max_features = 3150, max_df = 0.8, ngram_range = (1,1))
train_features = v.fit_transform(train.text)
test_features = v.transform(test.text)

In [24]:
clf = LogisticRegression(C = 2.1, solver='liblinear', multi_class = 'auto')
clf.fit(train_features, train['airline_sentiment'])
pred = clf.predict(test_features)
with open('predictions_twitter.csv', 'w') as f:
    for item in pred:
        f.write("%s\n" % item)