# Sentiment Analysis: Machine Learning Approach

### Labeling:

In [127]:
import pandas as pd

def review_labeling(data):
    """Load a headerless review CSV and convert its rating column to sentiment labels.

    Parameters
    ----------
    data : str or file-like
        Anything ``pandas.read_csv`` accepts. The file is read with
        ``header=None``, so columns are addressed by position and column 1
        is compared against numeric thresholds.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with column 1 replaced by 'positive' (rating >= 4),
        'neutral' (3 <= rating < 4) or 'negative' (anything else; a missing
        rating also ends up here because NaN fails both comparisons).
    """
    def to_sentiment(rating):
        # Guard clauses ordered from the highest threshold down.
        if rating >= 4:
            return 'positive'
        if rating >= 3:
            return 'neutral'
        return 'negative'

    reviews = pd.read_csv(data, header=None)
    # Only the second column (position 1) is relabelled; the text column is untouched.
    reviews[1] = reviews[1].apply(to_sentiment)
    return reviews

# Relative path to the review CSV that gets labelled below.
reviews_csv_path = 'Data/ML/reviews_ML_lightweight.csv'

# Column 0 keeps the review text, column 1 becomes the sentiment label.
df_processed = review_labeling(reviews_csv_path)
print(df_processed)

                                                      0         1
0     Great music service, the audio is high quality...  positive
1     Please ignore previous negative rating. This a...  positive
2     This pop-up "Get the best Spotify experience o...  positive
3       Really buggy and terrible to use as of recently  negative
4     Dear Spotify why do I get songs that I didn't ...  negative
...                                                 ...       ...
9995  Please make sure that the panels don't lag on ...   neutral
9996  This app has become a problem, why do you alwa...  negative
9997  The app is good, but the lyric won't show for ...  positive
9998  Good App..Doesn't have ads between songs and h...  positive
9999  Impossible to use on lock screen anymore. Trie...  negative

[10000 rows x 2 columns]


### Preprocessing:

In [128]:
import numpy as np
import pandas as pd
import spacy
import emoji
import re
import nltk
from tqdm import tqdm


In [129]:
# Fetch the NLTK resources (a no-op message is printed when they are already present).
# NOTE(review): none of the visible cells use these NLTK resources; stop-word
# removal and tokenization below go through spaCy. Confirm they are still needed.
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# Small English spaCy pipeline used for tokenization and lemmatization
nlp = spacy.load('en_core_web_sm')

# Map emojis to plain words. The emoji name is emitted with spaces instead of the
# default ":" delimiters, so "Python is 👍" comes out roughly as "Python is  thumbs up ".
def map_emojis(text):
    """Replace every emoji in ``text`` with its textual description.

    The description is surrounded by spaces rather than colons, and
    underscores are turned into spaces so multi-word names such as
    ``thumbs_up`` become separate words for the tokenizer.

    Note that the underscore replacement runs over the whole string, so
    underscores that were already in the input are replaced as well.
    """
    described = emoji.demojize(text, delimiters=(" ", " "))
    return described.replace("_", " ")

# Preprocessing function
def preprocess_text(text):
    """Normalise one raw review into a space-separated string of lemmas.

    Steps, in order: lowercase, replace emojis with words (``map_emojis``),
    strip URLs and e-mail addresses, then run the spaCy pipeline ``nlp`` and
    keep the lemma of every token that is not punctuation, a stop word or
    pure whitespace.

    Parameters
    ----------
    text : str
        The raw review text.

    Returns
    -------
    str
        The kept lemmas joined by single spaces (may be empty).
    """
    # Lowercase
    text = text.lower()

    # Convert emojis to text
    text = map_emojis(text)

    # Remove URLs and emails.
    # The dot after "www" is escaped on purpose: an unescaped "." matches any
    # character, so "awww so cute" would lose "www so" as if it were a URL.
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)

    # Tokenization and Lemmatization using spaCy.
    # Emoji mapping and URL removal leave runs of spaces behind, which spaCy
    # returns as whitespace tokens; drop them so they do not end up in the output.
    doc = nlp(text)
    tokens = [
        token.lemma_ for token in doc
        if not (token.is_punct or token.is_stop or token.is_space)
    ]

    return ' '.join(tokens)

# Register `progress_apply` on pandas objects so the slow spaCy pass shows a progress bar.
tqdm.pandas(desc="Preprocessing Reviews")

# NOTE: column 0 is overwritten with its own preprocessed version, so re-running
# this cell preprocesses already-preprocessed text; re-run the labeling cell first.
cleaned_reviews = df_processed[0].progress_apply(preprocess_text)
df_processed[0] = cleaned_reviews

print(df_processed)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Philipp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Philipp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Preprocessing Reviews: 100%|██████████| 10000/10000 [02:06<00:00, 78.75it/s]

                                                      0         1
0     great music service audio high quality app eas...  positive
1     ignore previous negative rating app super grea...  positive
2     pop good spotify experience android 12 annoyin...  positive
3                           buggy terrible use recently  negative
4               dear spotify song playlist shuffle play  negative
...                                                 ...       ...
9995  sure panel lag low internet connection wait ti...   neutral
9996  app problem download song time buffer time cru...  negative
9997                       app good lyric will nun song  positive
9998  good app doesn't ad song wide variety let expl...  positive
9999  impossible use lock screen anymore try find sa...  negative

[10000 rows x 2 columns]





### Train/Test Splitting:

In [130]:
from sklearn import model_selection as ms

# Column 0 holds the preprocessed text, column 1 the sentiment label.
df_text, df_target = df_processed[0], df_processed[1]

# Hold out 20% of the reviews for testing. Stratifying on the label keeps the
# class proportions the same in both splits; the fixed seed makes the split repeatable.
train_data, test_data, train_target, test_target = ms.train_test_split(
    df_text,
    df_target,
    test_size=0.2,
    random_state=69,
    stratify=df_target,
)

print(test_target)


520     negative
5887    positive
9153    positive
2397    negative
8081    positive
          ...   
4190    negative
8920    negative
8251    positive
5728    negative
795     positive
Name: 1, Length: 2000, dtype: object


### Count Vectorizer:

In [131]:
from sklearn.feature_extraction.text import CountVectorizer

# Exploratory look at the bag-of-words representation: learn the vocabulary
# from the training split and encode it as a sparse document-term count matrix.
cv = CountVectorizer()
train_features = cv.fit_transform(train_data)

# Shape is (number of training documents, vocabulary size).
print(train_features.shape)

# Uncomment to inspect the sparse rows of the first three documents:
#print(train_features[:3])

(8000, 6261)


In [132]:
from sklearn.preprocessing import Binarizer

# Collapse term counts to presence/absence: with the default threshold every
# count greater than 0 becomes 1.
transformer = Binarizer()
train_bin = transformer.fit_transform(train_features)

# Show the (unchanged) matrix shape, then the stored entries of the first document.
print(train_bin.shape, train_bin[0], sep='\n')

(8000, 6261)
<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 6 stored elements and shape (1, 6261)>
  Coords	Values
  (0, 4962)	1
  (0, 4029)	1
  (0, 4370)	1
  (0, 1194)	1
  (0, 5741)	1
  (0, 4378)	1


## Classification Process

In [133]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Binarizer

# Pipeline stages, applied in order:
#   vect      - bag-of-words counts; terms in fewer than 10 documents or in more
#               than half of the documents are dropped from the vocabulary
#   binarizer - counts reduced to 0/1 term presence
#   clf       - classification with multinomial Naive Bayes
pipeline_steps = [
    ('vect', CountVectorizer(min_df=10, max_df=0.5)),
    ('binarizer', Binarizer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(pipeline_steps)

# Fit on the training split only, then label the held-out reviews.
text_clf.fit(train_data, train_target)
print(text_clf)

predicted = text_clf.predict(test_data)
print(predicted)

Pipeline(steps=[('vect', CountVectorizer(max_df=0.5, min_df=10)),
                ('binarizer', Binarizer()), ('clf', MultinomialNB())])
['negative' 'positive' 'positive' ... 'positive' 'negative' 'positive']
