In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from sklearn.datasets import load_files
nltk.download('stopwords')
import pickle
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
data = pd.read_csv('Data\TweetSentiment.csv')

In [3]:
X = np.array(data['cleaned_tweets'].values)
y = np.array(data['sentiment'].values)

In [4]:
X

array(['video offic mind busi david solomon tell gs intern learn wa',
       'price lumber lb f sinc hit ytd high maci turnaround still happen',
       'say american dream dead', ...,
       'rt hd nuff said tel telcoin telfam crypto blockchain ethereum bitcoin btc eth',
       'btc', 'stellar xlm price binanc registr open limit time'],
      dtype=object)

In [5]:
documents = []

from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

for sen in range(0, len(X)):
    
    document = re.sub(r'\W', ' ', str(X[sen]))
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    document = re.sub(r'^b\s+', '', document)
    document = document.lower()
    document = document.split()
    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)
    
    documents.append(document)

In [6]:
documents

['video offic mind busi david solomon tell g intern learn wa',
 'price lumber lb sinc hit ytd high maci turnaround still happen',
 'say american dream dead',
 'barri silbert extrem optimist bitcoin predict new crypto entrant go zero',
 'satellit avoid attack space junk circl earth paid',
 'david butler favorit fang stock realmoneysod alphabet facebook',
 'miss convo one favorit thinker',
 'u intellig document nelson mandela made public',
 'senat want emerg alert go netflix spotifi etc',
 'hedg fund manag marc larsi say bitcoin possibl',
 'u propos expedit appeal fight amp time warner purchas',
 'roger feder uniqlo deal make one athlet earn endors',
 'bond trader ahead jerom powel come inflat expect via',
 'alcoa cut adjust ebitda forecast cite tariff share slide',
 'custom urg boycott mgm resort casino file lawsuit mass shoot victim',
 'gap tighten race trillion dollar valuat amazon hit billion via',
 'presid trump endors brian kemp casey cagl georgia governor race',
 'white hous strug

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfconverter = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = tfidfconverter.fit_transform(documents).toarray()

In [9]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [11]:
X_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [13]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train) 

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [14]:
y_pred = classifier.predict(X_test)

In [15]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[ 410   86   14]
 [  16 3427   44]
 [   5   83 1603]]
              precision    recall  f1-score   support

          -1       0.95      0.80      0.87       510
           0       0.95      0.98      0.97      3487
           1       0.97      0.95      0.96      1691

    accuracy                           0.96      5688
   macro avg       0.96      0.91      0.93      5688
weighted avg       0.96      0.96      0.96      5688

0.9563994374120957
