In [3]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# train data

df = pd.read_csv('train.csv')

In [5]:
# test data

df_test = pd.read_csv('test.csv')

In [6]:
df.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


In [7]:
df_test.head()

Unnamed: 0,message,tweetid
0,Europe will now be looking to China to make su...,169760
1,Combine this with the polling of staffers re c...,35326
2,"The scary, unimpeachable evidence that climate...",224985
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928


In [8]:
df['message'].iloc[0]

"PolySciMajor EPA chief doesn't think carbon dioxide is main cause of global warming and.. wait, what!? https://t.co/yeLvcEFXkC via @mashable"

In [9]:
df['message'].iloc[1]

"It's not like we lack evidence of anthropogenic global warming"

# 1. Data Preprocessing

### Taking care of null values - Train set

In [10]:
len(df)

15819

In [11]:
df['sentiment'].value_counts()

 1    8530
 2    3640
 0    2353
-1    1296
Name: sentiment, dtype: int64

##### Checking if an entry is null.

In [12]:
df.isnull().sum()

sentiment    0
message      0
tweetid      0
dtype: int64

##### Checking if a string is empty.

In [13]:
# Collect the index of every train row whose message is an empty or whitespace-only string.
blank = []
for i, sentiment, message, tweetid in df.itertuples():
    # isinstance() is required: comparing type(message) with the literal 'str' is always False,
    # which silently skipped the check for every row.
    if isinstance(message, str):
        # strip() leaves '' for both empty and whitespace-only text (isspace() misses '')
        if not message.strip():
            blank.append(i)

In [14]:
blank

[]

### Taking care of null values - Test set

##### Checking if an entry is null.

In [15]:
df_test.isnull().sum()

message    0
tweetid    0
dtype: int64

##### Checking if a string is empty.

In [16]:
# Collect the index of every test row whose message is an empty or whitespace-only string.
blank_t = []
for i, message, tweetid in df_test.itertuples():
    # isinstance() is required: comparing type(message) with the literal 'str' is always False,
    # which silently skipped the check for every row.
    if isinstance(message, str):
        # strip() leaves '' for both empty and whitespace-only text (isspace() misses '')
        if not message.strip():
            blank_t.append(i)

In [17]:
blank_t

[]

### Preprocessing the text

#### To clean up the text, I will use a combination of regular expressions to remove unwanted features and TweetTokenizer to tokenize the text, and I will also lemmatize the text, all in a single function.

In [18]:
import preprocessor as p

In [19]:
# creating relevant text preprocessing instances

# WordNetLemmatizer looks up the WordNet corpus the first time lemmatize() is called;
# fetch it here (as is done for 'stopwords') so the notebook runs on a fresh environment.
nltk.download('wordnet')

lem = WordNetLemmatizer()
token = TweetTokenizer()

In [21]:
# 
def cleaning_text(Data):
    """Normalise an iterable of raw tweet strings for vectorisation.

    Each tweet is lowercased, stripped of non-word characters and digits,
    has isolated single letters removed, is whitespace-normalised, and is then
    tokenised with the module-level ``token`` (TweetTokenizer) and lemmatised
    with the module-level ``lem`` (WordNetLemmatizer).

    Note: URLs, mentions and hashtags are not removed as units; only their
    punctuation is stripped, so their word fragments (e.g. ``http``, ``co``)
    remain in the output.

    Parameters
    ----------
    Data : iterable of str
        Raw tweet texts (e.g. a pandas Series of messages).

    Returns
    -------
    list of str
        One cleaned, space-joined string per input tweet, in input order.
    """
    tweet_list = []

    for tweet in Data:

        # converting text to lowercase
        doc = tweet.lower()

        # remove all punctuation and special characters from a tweet
        doc = re.sub(r'\W', ' ', doc)

        # remove all numbers
        doc = re.sub(r'\d', ' ', doc)

        # remove all single characters left over after special characters have been removed
        doc = re.sub(r'\s+[a-zA-Z]\s+', ' ', doc)

        # remove a single character from the start of the tweet; the caret must be an
        # unescaped anchor -- '\^' would look for a literal '^', which '\W' already removed
        doc = re.sub(r'^[a-zA-Z]\s+', ' ', doc)

        # substituting multiple spaces with a single space
        doc = re.sub(r'\s+', ' ', doc)

        # Tokenizing and lemmatization
        doc = [lem.lemmatize(word) for word in token.tokenize(doc)]

        # joining to get the tokens back into a string and appending to list
        tweet_list.append(' '.join(doc))

    return tweet_list
    
    

In [22]:
train_data = cleaning_text(df['message'])

In [23]:
train_data[:5]

['polyscimajor epa chief doesn think carbon dioxide is main cause of global warming and wait what http co yelvcefxkc via mashable',
 'it not like we lack evidence of anthropogenic global warming',
 'rt rawstory researcher say we have three year to act on climate change before it too late http co wdt kdur http co anpt',
 'todayinmaker wired wa pivotal year in the war on climate change http co wotxtlcd',
 'rt soynoviodetodas it and racist sexist climate change denying bigot is leading in the poll electionnight']

In [24]:
df['message_clean'] = train_data

In [25]:
df

Unnamed: 0,sentiment,message,tweetid,message_clean
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221,polyscimajor epa chief doesn think carbon diox...
1,1,It's not like we lack evidence of anthropogeni...,126103,it not like we lack evidence of anthropogenic ...
2,2,RT @RawStory: Researchers say we have three ye...,698562,rt rawstory researcher say we have three year ...
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736,todayinmaker wired wa pivotal year in the war ...
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954,rt soynoviodetodas it and racist sexist climat...
...,...,...,...,...
15814,1,RT @ezlusztig: They took down the material on ...,22001,rt ezlusztig they took down the material on gl...
15815,2,RT @washingtonpost: How climate change could b...,17856,rt washingtonpost how climate change could be ...
15816,0,notiven: RT: nytimesworld :What does Trump act...,384248,notiven rt nytimesworld what doe trump actuall...
15817,-1,RT @sara8smiles: Hey liberals the climate chan...,819732,rt sara smile hey liberal the climate change c...


# 2. Splitting train data into features and labels

In [26]:
x = df['message_clean']
y = df['sentiment']

# 3. Cross validation of our train data set

### Using the train data set, I perform cross validation by building a classification model via a pipeline object and training it with the train data. The built model will then be used to predict the results of the full test data set.

## splitting the train data into train test split

In [37]:
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size = 0.3, random_state = 42)

## Building pipeline object to vectorize and train the model with the cross validation train test split.

##### importing classification model(s): Note...As a first entry, I will only build one model, however, I will build several other models later on

In [38]:
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB





In [39]:
# Two-step text classifier: TF-IDF features over unigrams and bigrams (NLTK English
# stop words passed to the vectorizer), followed by a LinearSVC using the
# 'crammer_singer' multi-class strategy.
text_clf = Pipeline(
    [
        (
            'tfid_vectorizer',
            TfidfVectorizer(
                ngram_range=(1, 2),
                stop_words=stopwords.words('english'),
            ),
        ),
        (
            'L_SVC',
            LinearSVC(multi_class='crammer_singer'),
        ),
    ]
)

## training the pipeline object model

In [40]:
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfid_vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourse...
                                             'it', "it's", 'its', 'itself', ...],
                                 strip_accents=None, sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 

## predicting test set of the train test split of the cross validation.


In [41]:
y_pred = text_clf.predict(X_test)

In [42]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score

In [43]:
print(f1_score(y_test, y_pred, average = 'macro'))

0.668901535980853


In [44]:
print('Classification report:\n\n', classification_report(y_test, y_pred))

Classification report:

               precision    recall  f1-score   support

          -1       0.74      0.53      0.62       401
           0       0.63      0.40      0.49       666
           1       0.80      0.83      0.81      2598
           2       0.68      0.84      0.75      1081

    accuracy                           0.75      4746
   macro avg       0.71      0.65      0.67      4746
weighted avg       0.74      0.75      0.74      4746



In [45]:
# Pass the label order explicitly so the matrix always has one row/column per sentiment
# class and lines up with the axis names, even if a class is absent from y_test or y_pred.
sentiment_labels = [-1, 0, 1, 2]
print('Confusion matrix:\n\n',pd.DataFrame(confusion_matrix(y_test, y_pred, labels = sentiment_labels), index = sentiment_labels, columns = sentiment_labels))

Confusion matrix:

      -1    0     1    2
-1  214   48   115   24
 0   38  266   276   86
 1   34   97  2161  306
 2    4    9   164  904


In [46]:
print('Accuracy:\n\n',accuracy_score(y_test, y_pred))

Accuracy:

 0.7469447956173619


# 4. Evaluation of the test set

### I will now be evaluating the full test data set by cleaning the data and then predicting the results using the model trained on our train data.

### Pre processing our test set using the cleaning text function

In [None]:
test_data = cleaning_text(df_test['message'])

In [None]:
test_data[:5]

### Prediction of the test set

In [None]:
test_set_pred = text_clf.predict(test_data)

In [None]:
df_test['sentiment'] = test_set_pred

In [None]:
df_test.head()

# Submission to csv file that is to be uploaded/submitted on Kaggle.

In [169]:
df_test[['tweetid', 'sentiment']].to_csv('submission_2.csv', index = False)