In [8]:
import pandas as pd
import numpy as np
import re

In [17]:
stop_words = set(stopwords.words('english'))

In [18]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [9]:
class LogisticRegression:
    def __init__(self, learning_rate=0.01, num_iterations=1000):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def initialize_parameters(self, n):
        self.w = np.zeros((n, 1))
        self.b = 0

    def propagate(self, X, Y):
        m = X.shape[1] 
        A = self.sigmoid(np.dot(self.w.T, X) + self.b)
        cost = -(1/m) * np.sum(Y * np.log(A) + (1 - Y) * np.log(1 - A))
        
        # Backward propagation
        dw = (1/m) * np.dot(X, (A - Y).T)
        db = (1/m) * np.sum(A - Y)
        
        grads = {"dw": dw, "db": db}
        
        return grads, cost

    def optimize(self, X, Y):
        costs = []

        for i in range(self.num_iterations):
            grads, cost = self.propagate(X, Y)
            dw = grads["dw"]
            db = grads["db"]
            
            # Update parameters
            self.w -= self.learning_rate * dw
            self.b -= self.learning_rate * db
            
            if i % 100 == 0:
                costs.append(cost)
                print(f"Cost after iteration {i}: {cost}")
        
        return costs

    def fit(self, X, Y):
        # Initialize parameters
        n = X.shape[0]
        self.initialize_parameters(n)
        
        # Gradient descent
        costs = self.optimize(X, Y)
        
        return self

    def predict(self, X):
        m = X.shape[1]
        Y_prediction = np.zeros((1, m))
        A = self.sigmoid(np.dot(self.w.T, X) + self.b)
        
        for i in range(A.shape[1]):
            Y_prediction[0, i] = 1 if A[0, i] > 0.5 else 0
        
        return Y_prediction

In [10]:
data = pd.read_csv('labeled_data.csv')

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          24783 non-null  int64 
 1   count               24783 non-null  int64 
 2   hate_speech         24783 non-null  int64 
 3   offensive_language  24783 non-null  int64 
 4   neither             24783 non-null  int64 
 5   class               24783 non-null  int64 
 6   tweet               24783 non-null  object
dtypes: int64(6), object(1)
memory usage: 1.3+ MB


In [19]:
def data_processing(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r"https\S+|www\S+http\S+", '', tweet, flags = re.MULTILINE)
    tweet = re.sub(r'\@w+|\#','', tweet)
    tweet = re.sub(r'[^\w\s]','',tweet)
    tweet = re.sub(r'ð','',tweet)
    tweet_tokens = word_tokenize(tweet)
    filtered_tweets = [w for w in tweet_tokens if not w in stop_words]
    return " ".join(filtered_tweets)

In [20]:
data.tweet = data['tweet'].apply(data_processing)

In [21]:
lemmatizer = WordNetLemmatizer()
def lemmatizing(data):
    tweet = [lemmatizer.lemmatize(word) for word in data]
    return data

In [22]:
data['tweet'] = data['tweet'].apply(lambda x: lemmatizing(x))

In [23]:
from sklearn.model_selection import train_test_split

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
vect=TfidfVectorizer(ngram_range=(1,2)).fit(data['tweet'])

In [26]:
X = data['tweet']

In [27]:
y = data['class']

In [28]:
X = vect.transform(X)

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [30]:
model = LogisticRegression(learning_rate=0.1, num_iterations=1000)


In [32]:
model.fit(X_train, y_train)


MemoryError: Unable to allocate 1.27 MiB for an array with shape (332599,) and data type int32