In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import TweetTokenizer
from nltk import TreebankWordTokenizer, SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
import urllib
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




True

In [2]:
from google.colab import files
uploaded = files.upload()

Saving test.csv to test (4).csv
Saving train.csv to train (4).csv


In [3]:
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')

In [4]:
train_copy = train.copy()

In [5]:
stop_words = set(stopwords.words('english'))

In [6]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('@[^\s]+', 'AT_USER', text)
    text = "".join([char.lower() for char in text if char not in string.punctuation])
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub(r'#([^\s]+)', r'\1', text)
    return text

In [7]:
train_copy['clean'] = train['message']

In [8]:
def tokenize_column_data(df, column_name):
  tweet_tokenizer = TweetTokenizer()
  tweet_tokens = []
  for index, value in train_copy[column_name].items():
      tweet_tokens.append(tweet_tokenizer.tokenize(value))
  df['tokenized'] = np.array(tweet_tokens)
  df['tokenized'].apply(lambda x: [item for item in x if item not in stopwords.words('english')])
  return df

In [9]:
train_copy = tokenize_column_data(train, 'clean')

  


In [10]:
train_copy['nostopwords'] = train_copy['tokenized']

In [11]:
lemma_list = []
for index, value in train_copy['nostopwords'].items():
  lemmatizer = WordNetLemmatizer()
  lemma_tokens = [lemmatizer.lemmatize(w) for w in value]
  lemma_list.append(lemma_tokens)

In [12]:
train_copy['lemma'] = train_copy['nostopwords']

In [13]:
concat_list = []
for index, value in train_copy['nostopwords'].items():
  concat = " ".join(value)
  concat_list.append(concat)

In [14]:
train_copy['concatenate'] = np.array(concat_list)

In [15]:
train_copy.head()

Unnamed: 0,sentiment,message,tweetid,tokenized,nostopwords,lemma,concatenate
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221,"[PolySciMajor, EPA, chief, doesn't, think, car...","[PolySciMajor, EPA, chief, doesn't, think, car...","[PolySciMajor, EPA, chief, doesn't, think, car...",PolySciMajor EPA chief doesn't think carbon di...
1,1,It's not like we lack evidence of anthropogeni...,126103,"[It's, not, like, we, lack, evidence, of, anth...","[It's, not, like, we, lack, evidence, of, anth...","[It's, not, like, we, lack, evidence, of, anth...",It's not like we lack evidence of anthropogeni...
2,2,RT @RawStory: Researchers say we have three ye...,698562,"[RT, @RawStory, :, Researchers, say, we, have,...","[RT, @RawStory, :, Researchers, say, we, have,...","[RT, @RawStory, :, Researchers, say, we, have,...",RT @RawStory : Researchers say we have three y...
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736,"[#TodayinMaker, #, WIRED, :, 2016, was, a, piv...","[#TodayinMaker, #, WIRED, :, 2016, was, a, piv...","[#TodayinMaker, #, WIRED, :, 2016, was, a, piv...",#TodayinMaker # WIRED : 2016 was a pivotal yea...
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954,"[RT, @SoyNovioDeTodas, :, It's, 2016, ,, and, ...","[RT, @SoyNovioDeTodas, :, It's, 2016, ,, and, ...","[RT, @SoyNovioDeTodas, :, It's, 2016, ,, and, ...","RT @SoyNovioDeTodas : It's 2016 , and a racist..."


In [16]:
y = train_copy['sentiment']
X = train_copy['message']

In [17]:
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=1, stop_words="english")
X_vectorized = vectorizer.fit_transform(X)

In [18]:
train_copy.sentiment.value_counts()

 1    8530
 2    3640
 0    2353
-1    1296
Name: sentiment, dtype: int64

In [19]:
smote = SMOTE(sampling_strategy="not majority")
X_sm, y_sm = smote.fit_sample(X_vectorized, y)



In [30]:
X_train,X_val,y_train,y_val = train_test_split(X_sm,y_sm,test_size=0.3, random_state=11)

In [31]:
lsvc = LinearSVC(C=10, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
                           loss='squared_hinge', max_iter=1000,
                           multi_class='ovr', penalty='l2', random_state=None,
                           tol=0.0001, verbose=0)
lsvc.fit(X_train, y_train)
lsvc_pred = lsvc.predict(X_val)

In [32]:
f1_score(y_val, lsvc_pred, average="macro")

0.9464454383664399

In [33]:
testx = test['message']
test_vect = vectorizer.transform(testx)

In [34]:
y_pred = lsvc.predict(test_vect)

In [35]:
test['sentiment'] = y_pred

In [37]:
test[['tweetid','sentiment']].to_csv('submission.csv', index=False)