In [38]:
import numpy as np
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [39]:
stop_words = set(stopwords.words('english'))

In [40]:
data = pd.read_csv('labeled_data.csv', on_bad_lines='skip')


In [41]:
def data_processing(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r"https\S+|www\S+http\S+", '', tweet, flags=re.MULTILINE)
    tweet = re.sub(r'\@\w+|\#', '', tweet)
    tweet = re.sub(r'[^\\w\\s]', '', tweet)
    tweet = re.sub(r'รฐ', '', tweet)
    tweet_tokens = word_tokenize(tweet)
    stop_words = set(stopwords.words('english'))
    filtered_tweets = [w for w in tweet_tokens if not w in stop_words]
    return " ".join(filtered_tweets)


In [42]:
data['tweet'] = data['tweet'].apply(data_processing)


In [43]:
data = data.drop_duplicates('tweet')


In [44]:
lemmatizer = WordNetLemmatizer()
def lemmatizing(data):
    return " ".join([lemmatizer.lemmatize(word) for word in data.split()])

In [45]:
data['tweet'] = data['tweet'].apply(lemmatizing)


In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [47]:
vect = TfidfVectorizer(max_features=5000)
X = vect.fit_transform(data['tweet']).toarray()
y = data['class']

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [49]:
class LogisticRegression:
    def __init__(self, learning_rate=0.01, iterations=1000):
        self.learning_rate = learning_rate
        self.iterations = iterations

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        self.m, self.n = X.shape
        self.W = np.zeros(self.n)
        self.b = 0
        self.X = X
        self.y = y

        # Gradient Descent
        for _ in range(self.iterations):
            self.update_weights()

    def update_weights(self):
        y_pred = self.sigmoid(np.dot(self.X, self.W) + self.b)
        # Calculate gradients
        dW = (1 / self.m) * np.dot(self.X.T, (y_pred - self.y))
        db = (1 / self.m) * np.sum(y_pred - self.y)
        # Update weights
        self.W -= self.learning_rate * dW
        self.b -= self.learning_rate * db

    def predict(self, X):
        y_pred = self.sigmoid(np.dot(X, self.W) + self.b)
        y_pred_class = [1 if i > 0.5 else 0 for i in y_pred]
        return np.array(y_pred_class)


In [50]:
model = LogisticRegression(learning_rate=0.01, iterations=1000)

In [51]:
model.fit(X_train, y_train)

In [54]:
y_pred = model.predict(X_test)

In [55]:
accuracy = accuracy_score(y_test, y_pred)

In [58]:
print(f'{accuracy*100}%')

61.51603498542274%
