In [1]:
import numpy as np
import pandas as pd 
import os
import csv
import random
import string
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from collections import Counter




In [2]:
!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [3]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        if '.csv' in filename:
            data_path = os.path.join(dirname, filename)
        else:
            stopword_path = '/kaggle/input/stopwords/stopwords/english'

In [4]:
print(stopword_path, data_path)
data = pd.read_csv(data_path, encoding='ISO-8859-1', names=["sentiment", "ids", "date", "flag", "user", "text"])
data = data[['sentiment', 'text']]
data = data.sample(frac= 0.0001, random_state= 69)
X, Y = data['text'], data['sentiment']
print(type(X))

/kaggle/input/stopwords/stopwords/english /kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv
<class 'pandas.core.series.Series'>


In [5]:
print('dataset contains ', data.shape[0], ' rows')
Y.value_counts(normalize=True)

dataset contains  160  rows


4    0.5375
0    0.4625
Name: sentiment, dtype: float64

In [6]:
with open(stopword_path) as f:
    stopwords_list= f.readlines()
    stopwords = []
    for i in range(len(stopwords_list)):
        a = stopwords_list[i]
        b = a.strip('\n').lower()
        stopwords.append(b)

extras = ['your', 'u', 'my']
stopwords.extend(extras)
print(stopwords)
print(X.head(10))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'no

In [7]:
special_characters = list(string.punctuation)

nltk.download('punkt')
stemmer = PorterStemmer()

def remove_stopwords(text):
    text_wostemming = ''
    text_words = text.lower()
    text_words = text_words.split()
    sentiment_words = [word for word in text_words if word not in stopwords]
    sentiment_text = ' '.join(sentiment_words)
    for i in sentiment_text:
        if i in special_characters:
            continue
        else:
            text_wostemming = text_wostemming + i
    words = word_tokenize(text_wostemming)
    stemmed_words = [stemmer.stem(i) for i in words]
    stemmed_text = ' '.join(stemmed_words)
    return stemmed_text

X = X.apply(remove_stopwords)

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
print(data.columns)
print(data.shape)
print(X)

Index(['sentiment', 'text'], dtype='object')
(160, 2)
331760     michellebranch oh my hope videoshoot kickass then
438241                                             woke pain
364313        stayclassyson your soo lucki fair still 2 week
179622     1st ty follow back 2nd grrr dont cuz mess abl ...
1514041    bunnybridget care could eat million still look...
                                 ...                        
1358596    sbeecreat lol oddli enough dont eat much lobst...
1209105    markhoppu declar republican prop8 get 0 follow...
1185147    gmanandrizk ye love itbad busi peepsbut travel...
951098     lilyrose74 lol give happi till 830 work today ...
110018                               jamash that noth happen
Name: text, Length: 160, dtype: object


In [9]:
obj = SentimentIntensityAnalyzer()

In [10]:
for i in range(data.shape[0]):
    sentence = str(data.iloc[i]['text'])
    actual_sentiment = data.iloc[i]['sentiment']
    print(sentence)
    analysis_sent = obj.polarity_scores(sentence)
    if analysis_sent['compound'] < -0.05:
        op = 0
    elif analysis_sent['compound'] > 0.05:
        op = 4
    else:
        op = 2
    print(op, actual_sentiment) #lol even vader isn't working on the data in full efficiency

@michellebranch oh my!!!  I hope the videoshoot is kickass then! 
4 0
Woke up to pain 
0 0
@StayClassySon you're soo lucky! no fair, i still have 2 weeks 
4 0
1st- TY to those who follow back.  2nd - grrr to those who don't, cuz it messes up being able to follow others u want to when ur at 2000+ 
4 0
@BunnyBridget  who cares u could eat a million and still look gorgeous u only live once eat what u want !! 
4 4
Maita: chillin in CH's house in between shoots, look at their cool koi pond! I heart cloudy days   http://twitpic.com/50w0h
4 4
tool in Ubunutu for creating effects with windows etc....anyone tell me the name of it...can't remember 
4 0
@The_cobra666 @opinion8ed_dyke  yeah but this is insane... and apparently some people have taken an interest in my whereabouts.... 
4 0
@ba1L33 If they get it... still a few financing problems. 
0 0
wotsits are the boom diggy 
2 4
@TehKimber ~ Not *quite* so much sugar -- it was &quot;No Sugar Added&quot; sorbet, and I only had a small glass. 
2 4