In [28]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

# Download stopwords
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

!pip install textblob
from textblob import TextBlob



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




**Importing the file into Python**

In [30]:
# Reading the "test" file
with open("C:/Users/Administrator/Documents/Future Interns/Task 1/test.ft.txt", "r", encoding="utf-8") as file1:
    data = file1.readlines()

# Creating a Dataframe for the data
df = pd.DataFrame(data, columns=["Review"]) 
print(df.head())

                                              Review
0  __label__2 Great CD: My lovely Pat has one of ...
1  __label__2 One of the best game music soundtra...
2  __label__1 Batteries died within a year ...: I...
3  __label__2 works fine, but Maha Energy is bett...
4  __label__2 Great for the non-audiophile: Revie...


**Text Processing**

In [32]:
#Cleaning the data
def clean_text(text):
    text = re.sub(r"__label__\d+", "", text)  # Remove labels
    text = text.lower().strip()  # Lowercase & remove spaces
    text = re.sub(r"[^a-z\s]", "", text)  # Remove punctuation & numbers
    words = text.split()
    words = [word for word in words if word not in stop_words]  # Remove stopwords
    return " ".join(words)

#Applying cleaning
df["Cleaned_Review"] = df["Review"].apply(clean_text)

#Dropping column "Review"
df.drop (columns=["Review"],inplace=True)

print(df.head())  # Check cleaned data


                                      Cleaned_Review
0  great cd lovely pat one great voices generatio...
1  one best game music soundtracks game didnt rea...
2  batteries died within year bought charger jul ...
3  works fine maha energy better check maha energ...
4  great nonaudiophile reviewed quite bit combo p...


**Sentiment Analysis**

In [34]:
def get_sentiment(text):
    analysis = TextBlob(text)
    if analysis.sentiment.polarity > 0:
        return "Positive"
    elif analysis.sentiment.polarity < 0:
        return "Negative"
    else:
        return "Neutral"

# Applying sentiment analysis
df["Sentiment"] = df["Cleaned_Review"].apply(get_sentiment)

# Displaying results
print(df["Sentiment"].value_counts())  # Count sentiment categories
print(df.head())

Sentiment
Positive    301269
Negative     90315
Neutral       8416
Name: count, dtype: int64
                                      Cleaned_Review Sentiment
0  great cd lovely pat one great voices generatio...  Positive
1  one best game music soundtracks game didnt rea...  Positive
2  batteries died within year bought charger jul ...  Positive
3  works fine maha energy better check maha energ...  Positive
4  great nonaudiophile reviewed quite bit combo p...  Positive


In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400000 entries, 0 to 399999
Data columns (total 2 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Cleaned_Review  400000 non-null  object
 1   Sentiment       400000 non-null  object
dtypes: object(2)
memory usage: 6.1+ MB


In [46]:
#Converting the text file to CSV
df.to_csv("C:/Users/Administrator/Documents/Future Interns/Task 1/Sentiment_Analysis.csv", index= False)
