<a href="https://colab.research.google.com/github/mazen200/sentiment_analysis-/blob/main/Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Importing libraries**

In [312]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report,f1_score,recall_score,precision_score
import nltk
from nltk.corpus import stopwords
import string
import pickle

# **Load the dataset**

In [313]:
data = pd.read_csv("sentimentdataset.csv")

In [314]:
data.head()

Unnamed: 0,ID,Text,Sentiment (Label),Timestamp,User,Source,Topic,Retweets,Likes,Country,Year,Month,Day,Hour
0,0,Enjoying a beautiful day at the park! ...,Positive,1/15/2023 12:30,User123,Twitter,#Nature #Park,15,30,USA,2023,1,15,12
1,1,Traffic was terrible this morning. ...,Negative,1/15/2023 8:45,CommuterX,Twitter,#Traffic #Morning,5,10,Canada,2023,1,15,8
2,2,Just finished an amazing workout! 💪 ...,Positive,1/15/2023 15:45,FitnessFan,Instagram,#Fitness #Workout,20,40,USA,2023,1,15,15
3,3,Excited about the upcoming weekend getaway! ...,Positive,1/15/2023 18:20,AdventureX,Facebook,#Travel #Adventure,8,15,UK,2023,1,15,18
4,4,Trying out a new recipe for dinner tonight. ...,Neutral,1/15/2023 19:55,ChefCook,Instagram,#Cooking #Food,12,25,Australia,2023,1,15,19


# **Data Preprocessing**

In [315]:
from sklearn.preprocessing import LabelEncoder #encoding
def Feature_Encoder(X, cols):
    label_encoders = {}
    for c in cols:
        lbl = LabelEncoder()
        lbl.fit(X[c])
        X[c] = lbl.transform(X[c])
        label_encoders[c] = lbl
    return X, label_encoders


In [316]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))  # Set of English stopwords
translator = str.maketrans('', '', string.punctuation)  # Translator to remove punctuation
data['Text'] = data['Text'].apply(lambda x: ' '.join([word.translate(translator) for word in x.lower().split() if word not in stop_words]))  # Remove stopwords and punctuation, and convert to lowercase
data['Sentiment (Label)'] = data['Sentiment (Label)'].apply(lambda x: x.strip()) #removes trailing and leading whitespaces
data['Topic'] = data['Topic'].apply(lambda x: x.strip()) #removes trailing and leading whitespaces
data.drop(columns=['User','Timestamp','Source','Retweets',	'Likes','ID',	'Country','Year','Month',	'Day','Hour'], inplace=True)
data = data.assign(Topic=data['Topic'].str.split('#')).explode('Topic')
data.reset_index(drop=True,inplace=True)
data = data[data['Topic']!=""]
data.head()



[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mostafa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


1                      Nature 
2                         Park
4                     Traffic 
5                      Morning
7                     Fitness 
                 ...          
2189    HighSchoolPhilanthropy
2191      CulturalCelebration 
2192           HighSchoolUnity
2194     VirtualEntertainment 
2195      HighSchoolPositivity
Name: Topic, Length: 1464, dtype: object

In [317]:
cols=('Topic','Sentiment (Label)')
data, label_encoders = Feature_Encoder(data.iloc[:,:3],cols)

In [318]:
data.head(10)

Unnamed: 0,Text,Sentiment (Label),Topic
1,enjoying beautiful day park,146,615
2,enjoying beautiful day park,146,665
4,traffic terrible morning,134,943
5,traffic terrible morning,134,596
7,finished amazing workout 💪,146,312
8,finished amazing workout 💪,146,998
10,excited upcoming weekend getaway,146,949
11,excited upcoming weekend getaway,146,15
13,trying new recipe dinner tonight,135,177
14,trying new recipe dinner tonight,135,321


# **Feature Engineering**

In [319]:
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # Using TF-IDF for feature extraction
X = tfidf_vectorizer.fit_transform(data['Text'])  # Transform text data into numerical features
y = data['Sentiment (Label)']  # Target labels

# **Splitting data into training and testing sets**

In [320]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#**Model Selection and Training**


In [321]:
svm_model = SVC(kernel='poly')  # Using Support Vector Machine (SVM)
svm_model.fit(X_train, y_train)

# **Model Evaluation**


In [322]:
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test,y_pred,average ='weighted')
recall = recall_score(y_test, y_pred,average ='weighted')
f1sc = f1_score(y_test, y_pred,average ='weighted')

print("Accuracy:", accuracy)
print("precision : " , precision)
print("recall : ", recall)
print("f1 score : ",f1_score )
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8498293515358362
precision :  0.90107389588259
recall :  0.8498293515358362
f1 score :  <function f1_score at 0x71975d1c9cf0>
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.50      0.67         4
           1       1.00      1.00      1.00         1
           2       1.00      1.00      1.00         2
           5       1.00      1.00      1.00         1
           8       1.00      1.00      1.00         4
          11       1.00      1.00      1.00         2
          14       1.00      1.00      1.00         1
          15       1.00      0.33      0.50         3
          17       1.00      1.00      1.00         3
          18       1.00      1.00      1.00         1
          19       1.00      1.00      1.00         1
          21       1.00      1.00      1.00         2
          23       1.00      1.00      1.00         1
          24       1.00      1.00      1.00         1
          25       1.00     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [323]:
pickle.dump(svm_model, open('trained_model.sav', 'wb'))
pickle.dump(tfidf_vectorizer, open('vectorized_model.sav', 'wb'))
pickle.dump(label_encoders, open('label_encoder.sav', 'wb'))