In [1]:
# Install NLTK via the %pip magic, which targets the running kernel's environment.
%pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Fetch the NLTK data packages this notebook relies on. Packages that are
# already present and up to date are reported as such rather than re-fetched.
import nltk
nltk.download('stopwords')  # English stop-word list read in preprocess_text
nltk.download('wordnet')    # WordNet data behind WordNetLemmatizer
nltk.download('punkt')      # NOTE(review): the RegexpTokenizer used in preprocess_text does not appear to need punkt; confirm before removing

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nafise/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/nafise/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/nafise/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [4]:
# Preprocessing function
# The tokenizer, stop-word set and lemmatizer are identical for every message,
# so they are built once and reused instead of being rebuilt (and the
# stop-word corpus re-read) on every call.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared (tokenizer, stop_words, lemmatizer) triple, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        # Build everything before publishing so a failure (e.g. a corpus that
        # has not been downloaded) never leaves a half-filled cache behind.
        built = {
            'tokenizer': RegexpTokenizer(r'\w+'),
            'stop_words': frozenset(stopwords.words('english')),
            'lemmatizer': WordNetLemmatizer(),
        }
        _PREPROCESS_RESOURCES.update(built)
    return (
        _PREPROCESS_RESOURCES['tokenizer'],
        _PREPROCESS_RESOURCES['stop_words'],
        _PREPROCESS_RESOURCES['lemmatizer'],
    )


def preprocess_text(text):
    """Normalise one message into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase, split into ``\\w+`` tokens (which drops
    punctuation and other special characters), remove English stop words,
    lemmatize each remaining token.

    Args:
        text: The message to clean. Must provide ``.lower()`` (i.e. be a
            string); missing values should be dropped by the caller first.

    Returns:
        The cleaned tokens joined by single spaces; an empty string when no
        token survives.
    """
    tokenizer, stop_words, lemmatizer = _get_preprocess_resources()

    # Lowercase first so the stop-word comparison below is case-insensitive.
    tokens = tokenizer.tokenize(text.lower())

    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    )

In [5]:
# Train and evaluate function
def train_evaluate_svm(df, test_size=0.2, random_state=42):
    """Train a linear-kernel SVM on TF-IDF features and score it on a hold-out split.

    The text in ``df['message_clean']`` is run through ``preprocess_text``,
    split into train/test sets, vectorized with TF-IDF and used to fit an
    ``SVC(kernel='linear')``. The input frame is not modified.

    Args:
        df: DataFrame with a ``message_clean`` text column and an
            ``is_positive`` label column. Rows with missing values in either
            column are expected to have been removed by the caller.
            NOTE(review): the metrics use scikit-learn's default binary
            averaging, so ``is_positive`` is presumably a 0/1 label; confirm.
        test_size: Fraction of rows held out for evaluation.
        random_state: Seed for the train/test split, for reproducible metrics.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)`` computed on the test set.
    """
    # Clean into a local Series instead of writing back into df: assigning to
    # df['message_clean'] altered the caller's frame as a hidden side effect,
    # so re-running the cell would preprocess already-preprocessed text.
    X = df['message_clean'].apply(preprocess_text)
    y = df['is_positive']

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Vectorize the text data using TF-IDF. The vocabulary and IDF weights are
    # fitted on the training split only and then applied to the test split.
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    # Train the SVM classifier
    svm = SVC(kernel='linear')
    svm.fit(X_train, y_train)

    # Predict the sentiment labels for the test set
    y_pred = svm.predict(X_test)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    return accuracy, precision, recall, f1

In [6]:
# Read the labelled messages from the project's GitHub repository
df = pd.read_csv('https://raw.githubusercontent.com/Nafisedev/nlp-model/main/data/dataset.csv')

# Cast the text column to the pandas string dtype
df = df.astype({'message_clean': 'string'})

# Both the text and its label must be present; discard rows lacking either
required_columns = ['message_clean', 'is_positive']
if df[required_columns].isna().any().any():
    df = df.dropna(subset=required_columns)

# Fit the SVM classifier and collect its hold-out scores
accuracy, precision, recall, f1 = train_evaluate_svm(df)

# Report each metric on its own line
for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(metric_label, metric_value)

Accuracy: 0.73875
Precision: 0.7457128858402744
Recall: 0.7431640625
F1 Score: 0.7444362924920519
