In [37]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [38]:
# train data

df = pd.read_csv('train.csv')

In [39]:
# test data

df_test = pd.read_csv('test.csv')

In [40]:
df.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


In [41]:
df_test.head()

Unnamed: 0,message,tweetid
0,Europe will now be looking to China to make su...,169760
1,Combine this with the polling of staffers re c...,35326
2,"The scary, unimpeachable evidence that climate...",224985
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928


In [28]:
df['message'].iloc[0]

"PolySciMajor EPA chief doesn't think carbon dioxide is main cause of global warming and.. wait, what!? https://t.co/yeLvcEFXkC via @mashable"

In [29]:
df['message'].iloc[1]

"It's not like we lack evidence of anthropogenic global warming"

# 1. Data Preprocessing

### Taking care of null values - Train set

In [30]:
len(df)

15819

In [31]:
df['sentiment'].value_counts()

 1    8530
 2    3640
 0    2353
-1    1296
Name: sentiment, dtype: int64

##### Checking if an entry is null.

In [32]:
df.isnull().sum()

sentiment    0
message      0
tweetid      0
dtype: int64

##### Checking if a string is empty.

In [33]:
blank = []
for i, sentiment, message, tweetid in df.itertuples():
    if type(message) == 'str':
        if message.issspace():
            blank.append(i)

In [34]:
blank

[]

### Taking care of null values - Test set

##### Checking if an entry is null.

In [35]:
df_test.isnull().sum()

message    0
tweetid    0
dtype: int64

##### Checking if a string is empty.

In [42]:
blank_t = []
for i, message, tweetid in df_test.itertuples():
    if type(message) == 'str':
        if message.issspace():
            blank_t.append(i)

In [43]:
blank_t

[]

### Preprocessing the text

#### To clean up the text. I will be using a combination of regular expression to remove unwanted features, Tweettokenizer to tokenize the text and will also lemmatize the text in a single fucntion.

In [44]:
import preprocessor as p

In [45]:
# creating relevant text preprocessing instances

lem = WordNetLemmatizer()
token = TweetTokenizer()

In [59]:
# 
def cleaning_text(Data):
    tweet_list = []

    for tweet in Data:
    
        # cleaning tweet; removing URLs, mentions, Emojis, Smileys, RTs and FAV (Reserved words)
        doc = p.clean(tweet)
    
    # converting text to lowercase
        doc = doc.lower()
    
        # remove all punctuation and special characters from a tweet
        doc = re.sub(r'\W', ' ', doc)
    
        # remove all numbers
    
        doc = re.sub(r'\d', ' ', doc)

        # remove all singe characters after special characters have been removed
        doc = re.sub(r'\s+[a-zA-Z]\s+', ' ', doc)
    
        # remove all single characters from the start
        doc = re.sub(r'\^[a-zA-Z]\s+', ' ', doc)
    
        # substituting multiple spaces with a single space
        doc = re.sub(r'\s+', ' ', doc)
    
        # Tokenizing and lemmatization
    
        doc = [lem.lemmatize(word) for word in token.tokenize(doc)]
    
        # joining to get the tokens back into a string
        doc = ' '.join(doc)
    
        # appending to list
    
        tweet_list.append(doc)
    
    
    return tweet_list
    
    

In [60]:
train_data = cleaning_text(df['message'])

In [61]:
train_data[:5]

['polyscimajor epa chief doesn think carbon dioxide is main cause of global warming and wait what via',
 'it not like we lack evidence of anthropogenic global warming',
 'researcher say we have three year to act on climate change before it too late',
 'wired wa pivotal year in the war on climate change',
 'it and racist sexist climate change denying bigot is leading in the poll']

In [63]:
df['message_clean'] = train_data

In [64]:
df

Unnamed: 0,sentiment,message,tweetid,message_clean
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221,polyscimajor epa chief doesn think carbon diox...
1,1,It's not like we lack evidence of anthropogeni...,126103,it not like we lack evidence of anthropogenic ...
2,2,RT @RawStory: Researchers say we have three ye...,698562,researcher say we have three year to act on cl...
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736,wired wa pivotal year in the war on climate ch...
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954,it and racist sexist climate change denying bi...
...,...,...,...,...
15814,1,RT @ezlusztig: They took down the material on ...,22001,they took down the material on global warming ...
15815,2,RT @washingtonpost: How climate change could b...,17856,how climate change could be breaking up millio...
15816,0,notiven: RT: nytimesworld :What does Trump act...,384248,notiven rt nytimesworld what doe trump actuall...
15817,-1,RT @sara8smiles: Hey liberals the climate chan...,819732,hey liberal the climate change crap is hoax th...


# 2. Splitting train data into features and labels

In [65]:
x = df['message_clean']
y = df['sentiment']

# 3. Cross validation of our train data set

### Using the train data set, I perfom cross validation in building a classification model via a pipeline object and training it with the train data. The built model will then be used to predict the results of the full test data set.

## splitting the train data into train test split

In [91]:
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size = 0.6, random_state = 42)

## Building pipeline object to vectorize and train the model with the cross validation train test split.

##### importing classification model(s): Note...As a first entry, I will only build one model, however, I will build several other models later on

In [155]:
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB





In [156]:
text_clf = Pipeline([('tfid_vectorizer', TfidfVectorizer(ngram_range = (1,2), stop_words = stopwords.words('english'))), 
                     ('L_SVC', LinearSVC())])

## training the pipeline object model

In [157]:
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfid_vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourse...
                                             'it', "it's", 'its', 'itself', ...],
                                 strip_accents=None, sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 

## predicting test set of the train test split of the cross validation.


In [158]:
y_pred = text_clf.predict(X_test)

In [159]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score

In [160]:
print(f1_score(y_test, y_pred, average = 'macro'))

0.6047970153667253


In [161]:
print('Classification report:\n\n', classification_report(y_test, y_pred))

Classification report:

               precision    recall  f1-score   support

          -1       0.71      0.36      0.47       791
           0       0.56      0.40      0.47      1386
           1       0.73      0.85      0.79      5164
           2       0.69      0.69      0.69      2151

    accuracy                           0.71      9492
   macro avg       0.67      0.57      0.60      9492
weighted avg       0.70      0.71      0.69      9492



In [162]:
print('Confusion matrix:\n\n',pd.DataFrame(confusion_matrix(y_test, y_pred), index = [-1, 0, 1, 2], columns = [-1, 0, 1, 2]))

Confusion matrix:

      -1    0     1     2
-1  283  128   328    52
 0   39  551   671   125
 1   60  238  4384   482
 2   19   59   591  1482


In [163]:
print('Accuracy:\n\n',accuracy_score(y_test, y_pred))

Accuracy:

 0.7058575642646439


# 3. Evaluation of the test set

### I will now be evaluating the full test data set by cleaning the data and then predicting the results using our train data.

### Pre processing our test set using the cleaning text function

In [164]:
test_data = cleaning_text(df_test['message'])

In [165]:
test_data[:5]

['europe will now be looking to china to make sure that it is not alone in fighting climate change',
 'combine this with the polling of staffer re climate change and woman right and you have fascist state',
 'the scary unimpeachable evidence that climate change is already here',
 'putin got to you too jill trump doesn believe in climate change at all think it s hoax',
 'female orgasm cause global warming sarcastic republican']

### Prediction of the test set

In [166]:
test_set_pred = text_clf.predict(test_data)

In [167]:
df_test['sentiment'] = test_set_pred

In [168]:
df_test.head()

Unnamed: 0,message,tweetid,sentiment
0,Europe will now be looking to China to make su...,169760,1
1,Combine this with the polling of staffers re c...,35326,1
2,"The scary, unimpeachable evidence that climate...",224985,1
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263,1
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928,0


# Submission to csv file that is to be uploaded/submitted on Kaggle.

In [169]:
df_test[['tweetid', 'sentiment']].to_csv('submission_2.csv', index = False)