In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
import pickle

# Load and preprocess data
data_excel = pd.read_excel('Custmer VOC.xlsx')

# Preprocessing function (improved with stopwords)
def preprocess_arabic_text(text):
    # Remove special characters and digits
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    
    # Remove Arabic diacritics
    arabic_diacritics = re.compile(r'[ً-ْ]')
    text = re.sub(arabic_diacritics, '', text)
    
    # Tokenize and remove stopwords
    words = text.split()
    words = [word for word in words if word not in arabic_stopwords]
    return ' '.join(words)

# Download Arabic stopwords
nltk.download('stopwords')
arabic_stopwords = set(stopwords.words('arabic'))

# Apply preprocessing to the 'Customer Verbatim' column
data_excel['Cleaned Verbatim'] = data_excel['Customer Verbatim'].apply(preprocess_arabic_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [2]:
# Initialize TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(data_excel['Cleaned Verbatim'])
y = data_excel['Rate'].apply(lambda x: 1 if x == 'Positive' else (-1 if x == 'Negative' else 0))

# Split into training, validation, and test sets (60% train, 20% validation, 20% test)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Train the model (SVM with class_weight='balanced' to handle class imbalance)
svm = SVC(C=1, kernel='linear', gamma='scale', class_weight='balanced')
svm.fit(X_train, y_train)

# Evaluate the model on validation set
y_val_pred = svm.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_accuracy}")

# Evaluate the model on test set
y_test_pred = svm.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy}")

Validation Accuracy: 0.7916666666666666
Test Accuracy: 0.85


In [3]:
# Save the SVM model and TF-IDF vectorizer using pickle
with open('svm_model.pkl', 'wb') as model_file:
    pickle.dump(svm, model_file)

with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

print("Model and vectorizer saved successfully.")


Model and vectorizer saved successfully.


In [4]:
# Streamlit App for Sentiment Analysis
import streamlit as st

# Load the trained SVM model and vectorizer
with open('svm_model.pkl', 'rb') as model_file:
    svm_model = pickle.load(model_file)

with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

st.title("Sentiment Analysis Application")

# Input box for user text
user_input = st.text_area("Enter text for sentiment analysis")

if st.button("Predict"):
    # Preprocess the user input
    cleaned_input = preprocess_arabic_text(user_input)
    vectorized_input = vectorizer.transform([cleaned_input])
    
    # Make predictions with the SVM model
    prediction_svm = svm_model.predict(vectorized_input)[0]
    
    # Map the prediction to sentiment labels
    if prediction_svm == 1:
        sentiment = "Positive"
    elif prediction_svm == -1:
        sentiment = "Negative"
    else:
        sentiment = "Neutral"
    
    # Display the result
    st.write(f"Predicted Sentiment: {sentiment}")

2024-09-08 17:59:25.720 
  command:

    streamlit run d:\Anaconda3\envs\samirenv\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
