## Load data from the file source

In [89]:
import pandas as pd

# Load the raw combined reviews
raw_file = "../data/raw/raw_reviews.csv"
df = pd.read_csv(raw_file)

# Show first 5 rows
df.head()

Unnamed: 0,review,rating,date,bank,source
0,This application is very important and advanta...,5,2025-11-27 20:08:20,CBE,Google Play Store
1,why didn't work this app?,1,2025-11-27 19:40:35,CBE,Google Play Store
2,The app makes our life easier. Thank you CBE!,5,2025-11-27 15:00:06,CBE,Google Play Store
3,this app very bad ðŸ‘Ž,1,2025-11-27 13:28:10,CBE,Google Play Store
4,the most advanced app. but how to stay safe?,5,2025-11-27 07:03:41,CBE,Google Play Store


## Check How many reviews per bank?

In [90]:
reviews_per_bank = df['bank'].value_counts()
print("Number of reviews per bank:")
print(reviews_per_bank)

Number of reviews per bank:
bank
CBE       1000
BOA       1000
Dashen     763
Name: count, dtype: int64


## check for duplicated for each bank

In [91]:
duplicate_reviews_per_bank = df.groupby('bank')['review'].apply(lambda x: x.duplicated().sum())
print("Number of duplicate reviews per bank:")
print(duplicate_reviews_per_bank)

Number of duplicate reviews per bank:
bank
BOA       145
CBE       227
Dashen     99
Name: review, dtype: int64


## check for missing data for each bank

In [92]:
missing_data_per_bank = df.groupby('bank').apply(lambda x: x.isnull().sum())
print("Missing data for each bank:")
print(missing_data_per_bank)

Missing data for each bank:
        review  rating  date  bank  source
bank                                      
BOA          0       0     0     0       0
CBE          0       0     0     0       0
Dashen       0       0     0     0       0


  missing_data_per_bank = df.groupby('bank').apply(lambda x: x.isnull().sum())


## removing duplicated reviews

In [93]:
df_cleaned_reviews_no_duplicates = df.drop_duplicates(subset=['review'])
print(f"Number of reviews after removing duplicates: {len(df_cleaned_reviews_no_duplicates)}")

Number of reviews after removing duplicates: 2211


In [95]:
reviews_per_bank_no_duplicates = df_cleaned_reviews_no_duplicates['bank'].value_counts()
print("\nNumber of reviews per bank after removing duplicates:")
print(reviews_per_bank_no_duplicates)


Number of reviews per bank after removing duplicates:
bank
BOA       820
CBE       773
Dashen    618
Name: count, dtype: int64


## Normalizing date

In [101]:
df_cleaned_reviews_no_duplicates['date'] = pd.to_datetime(df_cleaned_reviews_no_duplicates['date'])
df_cleaned_reviews_no_duplicates['date'] = df_cleaned_reviews_no_duplicates['date'].dt.strftime('%Y-%m-%d')
display(df_cleaned_reviews_no_duplicates.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned_reviews_no_duplicates['date'] = pd.to_datetime(df_cleaned_reviews_no_duplicates['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned_reviews_no_duplicates['date'] = df_cleaned_reviews_no_duplicates['date'].dt.strftime('%Y-%m-%d')


Unnamed: 0,review,rating,date,bank,source
0,This application is very important and advanta...,5,2025-11-27,CBE,Google Play Store
1,why didn't work this app?,1,2025-11-27,CBE,Google Play Store
2,The app makes our life easier. Thank you CBE!,5,2025-11-27,CBE,Google Play Store
3,this app very bad ðŸ‘Ž,1,2025-11-27,CBE,Google Play Store
4,the most advanced app. but how to stay safe?,5,2025-11-27,CBE,Google Play Store


## saved cleaned file 

In [102]:
output_df = df_cleaned_reviews_no_duplicates[['review', 'rating', 'date', 'bank', 'source']]
output_file_path = '../data/processed/cleaned_reviews_final.csv'
output_df.to_csv(output_file_path, index=False)
print(f"DataFrame saved to {output_file_path}")

DataFrame saved to ../data/processed/cleaned_reviews_final.csv
