In [103]:
pip install nltk




In [128]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download NLTK resources (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lavan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lavan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [129]:
# Step 1: Load your custom stopwords
# Assuming your stopwords are in a file called 'stopwords.txt'
with open('stop_words_list.py', 'r') as file:
    custom_stopwords = file.read().splitlines()

In [130]:

# Step 1: Load your dataset Assuming your dataset has two columns:  (feedback) and  (label)
df = pd.read_csv('text_feedback_dataset.csv', encoding='ISO-8859-1')


In [131]:
# Step 2: Preprocess the data
# Check the dataset structure
print(df.head())

# Ensure there are no missing values
df.dropna(inplace=True)

# Split the dataset into features (X) and labels (y)
#X = df['feedback']  # Feedback text
#y = df['label']  # Sentiment labels (e.g., Positive, Negative, Neutral)

                                            feedback     label
0  I really enjoyed the lecture, it was very info...  Positive
1  The concepts were explained clearly, I learned...  Positive
2  The course is good, but some parts were a bit ...   Neutral
3  I'm not sure if I understood the topic complet...   Neutral
4  The teacher was a bit too fast, I had trouble ...  Negative


In [132]:
# Step 4: Clean the feedback text (removing special characters, numbers, etc.)
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

df['cleaned_feedback'] = df['feedback'].apply(clean_text)

# Split the dataset into features (X) and labels (y)
X = df['cleaned_feedback']  # Cleaned feedback text
y = df['label']  # Sentiment labels (e.g., Positive, Negative, Neutral)

In [133]:
# Check class distribution
print("\nClass Distribution:")
print(df['label'].value_counts())


Class Distribution:
label
Positive    508
Neutral     506
Negative    499
Name: count, dtype: int64


In [134]:
print(df.head())

                                            feedback     label  \
0  I really enjoyed the lecture, it was very info...  Positive   
1  The concepts were explained clearly, I learned...  Positive   
2  The course is good, but some parts were a bit ...   Neutral   
3  I'm not sure if I understood the topic complet...   Neutral   
4  The teacher was a bit too fast, I had trouble ...  Negative   

                                    cleaned_feedback  
0  i really enjoyed the lecture it was very infor...  
1  the concepts were explained clearly i learned ...  
2  the course is good but some parts were a bit c...  
3   im not sure if i understood the topic completely  
4  the teacher was a bit too fast i had trouble k...  


In [135]:
# Step 5: Convert text data into numerical features using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000, stop_words=custom_stopwords)
X_tfidf = vectorizer.fit_transform(X)



In [136]:
# Check TF-IDF features
print("\nTF-IDF Features Shape:", X_tfidf.shape)
print("First Sample's TF-IDF Features:")
print(X_tfidf[0])


TF-IDF Features Shape: (1513, 867)
First Sample's TF-IDF Features:
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 8 stored elements and shape (1, 867)>
  Coords	Values
  (0, 607)	0.5168231581331081
  (0, 227)	0.44166233934284804
  (0, 737)	0.1522037056810792
  (0, 438)	0.4315504198398246
  (0, 414)	0.24433906642752787
  (0, 824)	0.18982266558719493
  (0, 816)	0.32087586546517005
  (0, 385)	0.36018179235660464


In [137]:
# Step 6: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [139]:
# Step 7: Hyperparameter tuning using GridSearchCV
param_grid = {
    'C': [0.01, 0.1, 1, 10],  # Regularization strength
    'solver': ['liblinear', 'saga'],  # Solvers to try
    'max_iter': [100, 200, 300]  # Iterations for convergence
}

grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model after tuning
best_model = grid_search.best_estimator_

In [140]:
# Step 8: Evaluate the model
y_pred = best_model.predict(X_test)

In [141]:
# Evaluate performance
print("\nBest Hyperparameters:", grid_search.best_params_)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))


Best Hyperparameters: {'C': 1, 'max_iter': 200, 'solver': 'saga'}

Classification Report:
               precision    recall  f1-score   support

    Negative       0.87      0.81      0.84       113
     Neutral       0.82      0.78      0.80        92
    Positive       0.85      0.95      0.90        98

    accuracy                           0.85       303
   macro avg       0.85      0.85      0.85       303
weighted avg       0.85      0.85      0.85       303

Accuracy: 0.8481848184818482


In [146]:
# Step 8: Use the trained model to predict sentiment for new feedback
def predict_sentiment(feedback, vectorizer, model):
    # Clean the feedback text (same preprocessing as before)
    feedback_cleaned = clean_text(feedback)
    
    # Transform the cleaned feedback into TF-IDF features
    feedback_tfidf = vectorizer.transform([feedback_cleaned])
    
    # Predict the sentiment
    sentiment = model.predict(feedback_tfidf)[0]
    
    return sentiment

# Example usage
new_feedback = "the course was  informative , i learnt from basic"
predicted_sentiment = predict_sentiment(new_feedback, vectorizer, best_model)
print(f"\nPredicted Sentiment: {predicted_sentiment}")



Predicted Sentiment: Neutral
