In [None]:
#Importing libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from math import * # module math
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read and Load Dataset
yelp_data = pd.read_csv('/content/combined_reviews_businesses.zip', sep=',', header=0)
yelp_data = yelp_data.rename(columns={"stars_y": "float_stars", "stars_x": "stars"})
yelp_data_filtered = yelp_data[["review_id","text", "stars"]]


In [None]:
yelp_data_filtered.head()

Unnamed: 0,review_id,text,stars
0,KU_O5udG6zpxOg-VcAEodg,"If you decide to eat here, just be aware it is...",3
1,saUsX_uimxRlCVr67Z4Jig,Family diner. Had the buffet. Eclectic assortm...,3
2,AqPFMleE6RsU23_auESxiA,"Wow! Yummy, different, delicious. Our favo...",5
3,Sx8TMOWLNuJBWer-0pcmoA,Cute interior and owner (?) gave us tour of up...,4
4,JrIxlS1TzJ-iCu79ul40cQ,I am a long term frequent customer of this est...,1


In [None]:
#Create sentiment column from star ratings. Ratings 4 and 5 represent positive sentiment, Ratings 1 and 2 represent negative sentiment and Ratings 3 represents neutral sentiment.
yelp_data_filtered.loc[yelp_data_filtered['stars'] == 3, 'sentiment'] = 'neutral'
yelp_data_filtered.loc[yelp_data_filtered['stars'] < 3, 'sentiment'] = 'negative'
yelp_data_filtered.loc[yelp_data_filtered['stars'] > 3, 'sentiment'] = 'positive'

yelp_data_filtered.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yelp_data_filtered.loc[yelp_data_filtered['stars'] == 3, 'sentiment'] = 'neutral'


Unnamed: 0,review_id,text,stars,sentiment
0,KU_O5udG6zpxOg-VcAEodg,"If you decide to eat here, just be aware it is...",3,neutral
1,saUsX_uimxRlCVr67Z4Jig,Family diner. Had the buffet. Eclectic assortm...,3,neutral
2,AqPFMleE6RsU23_auESxiA,"Wow! Yummy, different, delicious. Our favo...",5,positive
3,Sx8TMOWLNuJBWer-0pcmoA,Cute interior and owner (?) gave us tour of up...,4,positive
4,JrIxlS1TzJ-iCu79ul40cQ,I am a long term frequent customer of this est...,1,negative


In [5]:
#Language detection. Delete all reviews in non-english language
#install the library
!pip install langdetect

#import detect function from langdetect
from langdetect import detect, LangDetectException

non_en = []  # Store non-English or undetectable reviews
en_reviews = []  # Store English reviews

for index, row in yelp_data_filtered.iterrows():
    try:
        lang = detect(row['text'])
        if lang != 'en':
            non_en.append(row)
        else:
            en_reviews.append(row)
    except LangDetectException:
        non_en.append(row)  # Collect undetectable reviews

# Create DataFrames
non_en_df = pd.DataFrame(non_en)
en_reviews_df = pd.DataFrame(en_reviews)

# Now you have two DataFrames:
# - en_reviews_df: Contains reviews detected as English
# - non_en_df: Contains reviews in other languages or those that couldn't be detected

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m286.7/981.5 kB[0m [31m8.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m972.8/981.5 kB[0m [31m18.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993222 sha256=89fe9a649b0469f5fb7fb979bededc61703d18376cebb94915e56a89acb06f13
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711

In [8]:
non_en_df.head()


Unnamed: 0,review_id,text,stars,sentiment
594,5TWmP6zi4aWQygfOC6b9wg,"De vacaciones y sin ganas de cocinar, decidimo...",5,positive
2360,daLSvHIhA3kJOwBC8Ojp-w,Op suggestie van yelp- hier geweest op onze ee...,5,positive
3005,JI_89Ib6UMPS0Sb3-jyshA,Excelente comida mediterránea! El gyro es buen...,5,positive
4034,oWzz7Vug41O4_Q_VxNbEgg,Peter piper on roids. Good pizza,4,positive
6574,lnCQF7yf3eRZgxTjBoFTAQ,点了四个菜 没一个好吃的 不适合中国人吃\n\n口水鸡鸡肉不新鲜\n毛血旺的大肠好油\n酸豆...,2,negative


In [12]:
len(non_en_df)

5056

In [10]:
len(en_reviews_df)

4719628

In [11]:
# prompt: save en_reviews_df as a csv file

en_reviews_df.to_csv('en_reviews.csv', index=False)