In [1]:
import numpy as np
import pandas as pd 
import os
import csv
import random
import string
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from collections import Counter



In [2]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        if '.csv' in filename:
            data_path = os.path.join(dirname, filename)
        else:
            stopword_path = '/kaggle/input/d/nltkdata/stopwords/stopwords/english'

In [3]:
print(stopword_path, data_path)
data = pd.read_csv(data_path, encoding='ISO-8859-1', names=["sentiment", "ids", "date", "flag", "user", "text"])
data = data[['sentiment', 'text']]
data = data.sample(frac= 0.01, random_state= 69)
X, Y = data['text'], data['sentiment']
print(data.head(10))

/kaggle/input/d/nltkdata/stopwords/stopwords/english /kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv
         sentiment                                               text
331760           0  @michellebranch oh my!!!  I hope the videoshoo...
438241           0                                   Woke up to pain 
364313           0  @StayClassySon you're soo lucky! no fair, i st...
179622           0  1st- TY to those who follow back.  2nd - grrr ...
1514041          4  @BunnyBridget  who cares u could eat a million...
932925           4  Maita: chillin in CH's house in between shoots...
730618           0  tool in Ubunutu for creating effects with wind...
468854           0  @The_cobra666 @opinion8ed_dyke  yeah but this ...
225427           0  @ba1L33 If they get it... still a few financin...
1537939          4                        wotsits are the boom diggy 


In [4]:
print(f'dataset contains {data.shape[0]} rows')
Y.value_counts(normalize=True)

dataset contains 16000 rows


4    0.505563
0    0.494437
Name: sentiment, dtype: float64

In [5]:
with open(stopword_path) as f:
    stopwords_list= f.readlines()
    stopwords = []
    for i in range(len(stopwords_list)):
        a = stopwords_list[i]
        b = a.strip('\n').lower()
        stopwords.append(b)

extras = ['your', 'u', 'my']
stopwords.extend(extras)
print(X.head(10))

331760     @michellebranch oh my!!!  I hope the videoshoo...
438241                                      Woke up to pain 
364313     @StayClassySon you're soo lucky! no fair, i st...
179622     1st- TY to those who follow back.  2nd - grrr ...
1514041    @BunnyBridget  who cares u could eat a million...
932925     Maita: chillin in CH's house in between shoots...
730618     tool in Ubunutu for creating effects with wind...
468854     @The_cobra666 @opinion8ed_dyke  yeah but this ...
225427     @ba1L33 If they get it... still a few financin...
1537939                          wotsits are the boom diggy 
Name: text, dtype: object


In [6]:
special_characters = list(string.punctuation)

nltk.download('punkt')
stemmer = PorterStemmer()

def remove_stopwords(text):
    text_wostemming = ''
    text_words = text.lower()
    text_words = text_words.split()
    sentiment_words = [word for word in text_words if word not in stopwords]
    sentiment_text = ' '.join(sentiment_words)
    for i in sentiment_text:
        if i in special_characters:
            continue
        else:
            text_wostemming = text_wostemming + i
    words = word_tokenize(text_wostemming)
    stemmed_words = [stemmer.stem(i) for i in words]
    stemmed_text = ' '.join(stemmed_words)
    return stemmed_text

X = X.apply(remove_stopwords)

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
print(data.columns)
print(data.shape)
print(X)

Index(['sentiment', 'text'], dtype='object')
(16000, 2)
331760     michellebranch oh my hope videoshoot kickass then
438241                                             woke pain
364313        stayclassyson your soo lucki fair still 2 week
179622     1st ty follow back 2nd grrr dont cuz mess abl ...
1514041    bunnybridget care could eat million still look...
                                 ...                        
609354       mvanduyn idea jb come germani oh btwyour pretti
1054238                               half blind haha jealou
1460594    eteplil saw blog ashleenew design tooha good w...
866815                                               chillin
358082                               say wha woke nose bleed
Name: text, Length: 16000, dtype: object


In [8]:
tfidf_vectorizer = TfidfVectorizer()

X = tfidf_vectorizer.fit_transform(X) #Term Frequency-Inverse Document Frequency; TF = (Number of occurrences of a term in a document) / (Total number of terms in the document); IDF = log((Total number of documents) / (Number of documents containing the term)); TF-IDF = TF * IDF

In [9]:
vocabulary = tfidf_vectorizer.get_feature_names_out()

print(vocabulary.shape)

(24548,)


In [10]:
print(X.shape, Y)

(16000, 24548) 331760     0
438241     0
364313     0
179622     0
1514041    4
          ..
609354     0
1054238    4
1460594    4
866815     4
358082     0
Name: sentiment, Length: 16000, dtype: int64


In [11]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.1)
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(14400, 24548) (14400,) (1600, 24548) (1600,)


In [12]:
class LogisticRegressionScratch:
    def __init__(self, alpha, iterations):
        self.learning_rate = alpha
        self.iterations = iterations
    
    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z)) # 1/1 + e^-x
    
    def fit(self, x, y):
        m, n = x.shape
        self.w = np.zeros(n)
        self.b = 0
        
        for iteration in range(self.iterations):
            z = np.dot(x, self.w) + self.b
            h = self.sigmoid(z)
            grad_w = np.dot(x.T, (h - y)) / m
            grad_b = np.sum(h - y) / m
            self.w -= self.learning_rate * grad_w
            self.b -= self.learning_rate * grad_b
            cost = (-1/m)*np.sum( y*np.log(h) + (1-y)*np.log(1-h))
            if iteration%50 == 0:
                print("Epoch : ", iteration, "cost function : ", cost)
    
    def predict(self, x):
        z = np.dot(x, self.w) + self.b
        h = self.sigmoid(z)
        predictions = np.where(h >= 0.5, 4, 0)
        return predictions

lr = LogisticRegressionScratch(alpha=0.001, iterations=300)
lr.fit(x_train.toarray(), y_train)

y_pred = lr.predict(x_test.toarray())

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Epoch :  0 cost function :  0.6931471805599454
Epoch :  50 cost function :  0.5781765976742017
Epoch :  100 cost function :  0.46605310710978265
Epoch :  150 cost function :  0.3566945737371928
Epoch :  200 cost function :  0.2500143775991376
Epoch :  250 cost function :  0.1459224632807901
Accuracy: 0.5075
