# IMPORT LIBRARIES

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split

# IMPORT DATA SET

In [2]:
sample_submissions = pd.read_csv("https://raw.githubusercontent.com/Rirhandzu95/predict3/master/sample_submission.csv")
train_df = pd.read_csv("https://raw.githubusercontent.com/Rirhandzu95/predict3/master/train.csv")
test_df = pd.read_csv("https://raw.githubusercontent.com/Rirhandzu95/predict3/master/test.csv")

In [3]:
train_df.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


In [4]:
test_df.head()

Unnamed: 0,message,tweetid
0,Europe will now be looking to China to make su...,169760
1,Combine this with the polling of staffers re c...,35326
2,"The scary, unimpeachable evidence that climate...",224985
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928


# CHECK MISSING VALUES

In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15819 entries, 0 to 15818
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  15819 non-null  int64 
 1   message    15819 non-null  object
 2   tweetid    15819 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 370.9+ KB


In [6]:
train_df.isnull().sum()

sentiment    0
message      0
tweetid      0
dtype: int64

In [7]:
len(train_df)

15819

# Check for whitespace strings

In [8]:
df = train_df.copy()
blanks = []
#(index, sentiment, message, tweetid)
for i,se,msg,tid in df.itertuples():
    if msg.isspace():
        blanks.append(i)

In [9]:
blanks

[]

# check for unique values 

In [10]:
df['sentiment'].unique()
df['sentiment'].value_counts()

 1    8530
 2    3640
 0    2353
-1    1296
Name: sentiment, dtype: int64

# Split the data into train & test sets

In [11]:
X = df['message']  
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scikit-learn's CountVectorizer

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()

count_vect.fit(X_train)
X_train_counts = count_vect.transform(X_train)
X_train_counts.shape

(12655, 27173)

# Transform Counts to Frequencies with Tf-idf

In [7]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()

X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)


(12655, 27173)

# Combine Steps with TfidVectorizer

In [8]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()

X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(12655, 27173)

# Train a Classifier

In [9]:
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(X_train_tfidf,y_train)

LinearSVC()

# Build a Pipeline

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC()),
])

# Feed the training data through the pipeline
text_clf.fit(X_train, y_train)  

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

# Test the classifier and display results

In [11]:
# Form a prediction set
predictions = text_clf.predict(X_test)

In [12]:
# Report the confusion matrix
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))

[[ 135   35   95   13]
 [  18  183  192   32]
 [  29   77 1496  153]
 [   7   10  118  571]]


In [13]:
# Print a classification report
print(metrics.classification_report(y_test,predictions))

              precision    recall  f1-score   support

          -1       0.71      0.49      0.58       278
           0       0.60      0.43      0.50       425
           1       0.79      0.85      0.82      1755
           2       0.74      0.81      0.77       706

    accuracy                           0.75      3164
   macro avg       0.71      0.64      0.67      3164
weighted avg       0.75      0.75      0.74      3164



In [14]:
# Print the overall accuracy
print(metrics.accuracy_score(y_test,predictions))

0.7537926675094817


In [15]:
y_pred= pd.DataFrame(predictions).astype(int)
predictive_model = pd.DataFrame()
predictive_model['tweetid'] = test_df['tweetid']
predictive_model['sentiment'] = y_pred
sample_submissions['sentiment'].value_counts()

1    10546
Name: sentiment, dtype: int64

In [16]:
predictive_model.head()

Unnamed: 0,tweetid,sentiment
0,169760,1.0
1,35326,2.0
2,224985,1.0
3,476263,-1.0
4,872928,0.0


In [17]:
predictive_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10546 entries, 0 to 10545
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   tweetid    10546 non-null  int64  
 1   sentiment  3164 non-null   float64
dtypes: float64(1), int64(1)
memory usage: 164.9 KB


In [18]:
predictive_model.to_csv('Classification_predict.csv')