In [15]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from collections import defaultdict, Counter
import nltk

In [16]:
nltk.download('punkt')

[nltk_data] Error loading punkt: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>


False

In [17]:
data = pd.read_csv('../../Warehouse/Reviews/app_reviews_merged.csv')

In [18]:
# Preprocess the data
data['content'] = data['content'].str.lower()

# Specific features to look for in reviews
features = [
    'user interface', 'account registration', 'transaction process', 'payment options', 'security', 
    'rewards program', 'customer support', 'app performance', 'transaction history', 'notifications', 
    'savings features', 'expense tracking', 'budgeting tools', 'cashback offers', 'rewards redemption', 
    'bill payments', 'money transfer', 'ease of use', 'card management', 'app updates', 
    'user feedback responsiveness', 'offers and promotions', 'offline usability', 'transaction limits', 
    'foreign currency support', 'reliability', 'data privacy', 'user onboarding', 'transaction fees', 
    'account settings', 'account linking', 'app design', 'financial education resources', 'synchronization with bank accounts'
]

In [19]:
# Create a dictionary to store user patterns
user_patterns = defaultdict(lambda: {'average_score': 0, 'feature_mentions': Counter()})

# Calculate the average score per user
user_avg_scores = data.groupby('userName')['score'].mean()

In [20]:
# Tokenize the review content and count the mentions of specific features for each user
for _, row in data.iterrows():
    user = row['userName']
    content = row['content']
    tokens = word_tokenize(content)
    
    for feature in features:
        feature_tokens = word_tokenize(feature)
        if set(feature_tokens).issubset(set(tokens)):
            user_patterns[user]['feature_mentions'][feature] += 1 #type: ignore

In [21]:
# Update the average score in user_patterns
for user, avg_score in user_avg_scores.items():
    user_patterns[user]['average_score'] = avg_score

In [22]:
# Group users based on their average scores and feature mentions
high_rating_users = [user for user, pattern in user_patterns.items() if pattern['average_score'] >= 4] #type: ignore
low_rating_users = [user for user, pattern in user_patterns.items() if pattern['average_score'] < 4] #type: ignore

In [23]:
print("High rating users:")
print(high_rating_users)
print("\nLow rating users:")
print(low_rating_users)

High rating users:
['Nishant Chaudhary', 'Sanjeev Kumar', 'Kunal Yadav', 'Ajay Singh', 'Raj Sharma', 'Aditya Chauhan', 'Manju Pandey', 'Rakesh S', 'Gulshan Kumar', 'Priyanshu Barik', 'Karan Kumar', 'Yash Sapkal', 'V.k. Techno boy', 'Preeti Prajapati', 'Shreyansh kumar', 'JaShUvA', 'Nathanael Isaiah', 'Sourin Roy', 'Syed Zayed', 'Krish Pathare', 'Shubham Gharat', 'Roni Barman', 'Chaithu Latha', 'Tapan Nayak', 'NISCHAL SAI', 'Saud Moosa', 'Malik gaming 07', 'Mr Suraj', 'Figo Sangma', 'BĦΔṈU ツ CĦΔṈDER', 'Tapas Ghosh', 'Tãrüñ Tîwårï', 'Mukul Miglani', 'Nayana parsiya', 'DEADSHOT', 'Zuneid Aktar Hussain', 'Animesh Sinha', 'kushal joshi', 'Lokesh Gujjar', 'Vihaan Goel', 'Archana Tanwar', 'Ashok Kumar Yadav', 'BaLu -_-', 'Ayush (Ayush)', 'Karthikeya Javangula', 'Om Vyavhare', 'Pradip Upadhyay', 'Omkar Karale', 'Street Food Plus Vlog', 'Krish.H. Barot', 'Anoop Kumar', 'Wahed Ali', 'King Zaid Gaming.', '13-srishant Kundar', 'Saarthak Biswakarma', 'Shalam Md afshal', 'Saqib Raeen', 'Nakul Laddha

In [24]:
# Analyze feature mentions for different user segments
high_rating_feature_mentions = Counter()
low_rating_feature_mentions = Counter()

for user in high_rating_users:
    high_rating_feature_mentions += user_patterns[user]['feature_mentions'] #type: ignore

for user in low_rating_users:
    low_rating_feature_mentions += user_patterns[user]['feature_mentions'] #type: ignore

In [25]:
print("\nFeature mentions by high rating users:")
print(high_rating_feature_mentions)
print("\nFeature mentions by low rating users:")
print(low_rating_feature_mentions)


Feature mentions by high rating users:
Counter({'money transfer': 192, 'security': 43, 'app updates': 37, 'customer support': 25, 'payment options': 19, 'user interface': 16, 'app performance': 13, 'transaction history': 9, 'transaction process': 6, 'notifications': 6, 'cashback offers': 6, 'app design': 6, 'savings features': 3, 'transaction limits': 3, 'account linking': 2, 'card management': 2, 'rewards program': 2, 'ease of use': 1, 'account registration': 1, 'transaction fees': 1, 'bill payments': 1})

Feature mentions by low rating users:
Counter({'money transfer': 611, 'customer support': 312, 'app updates': 67, 'security': 56, 'transaction process': 47, 'payment options': 32, 'app performance': 31, 'transaction history': 22, 'user interface': 14, 'notifications': 12, 'account registration': 11, 'transaction limits': 8, 'app design': 7, 'account settings': 5, 'account linking': 4, 'cashback offers': 3, 'transaction fees': 3, 'card management': 2, 'bill payments': 2, 'savings fe

In [26]:
# Save high rating users to a CSV file
high_rating_users_df = pd.DataFrame(high_rating_users, columns=['userName'])
high_rating_users_df.to_csv('high_rating_users.csv', index=False)

# Save low rating users to a CSV file
low_rating_users_df = pd.DataFrame(low_rating_users, columns=['userName'])
low_rating_users_df.to_csv('low_rating_users.csv', index=False)

# Save feature mentions by high rating users to a CSV file
high_rating_feature_mentions_df = pd.DataFrame.from_dict(high_rating_feature_mentions, orient='index', columns=['mentions'])
high_rating_feature_mentions_df.reset_index(level=0, inplace=True)
high_rating_feature_mentions_df.rename(columns={'index': 'feature'}, inplace=True)
high_rating_feature_mentions_df.to_csv('high_rating_feature_mentions.csv', index=False)

# Save feature mentions by low rating users to a CSV file
low_rating_feature_mentions_df = pd.DataFrame.from_dict(low_rating_feature_mentions, orient='index', columns=['mentions'])
low_rating_feature_mentions_df.reset_index(level=0, inplace=True)
low_rating_feature_mentions_df.rename(columns={'index': 'feature'}, inplace=True)
low_rating_feature_mentions_df.to_csv('low_rating_feature_mentions.csv', index=False)

In [32]:
# plot the feature mentions using plotly
import plotly.express as px
import plotly.io as pio

fig = px.bar(high_rating_feature_mentions_df, x='feature', y='mentions', title='Feature mentions by high rating users')
fig.show()
pio.write_html(fig, file='high_rating_feature_mentions.html', auto_open=True)

In [33]:
fig = px.bar(low_rating_feature_mentions_df, x='feature', y='mentions', title='Feature mentions by low rating users')
fig.show()
pio.write_html(fig, file='low_rating_feature_mentions.html', auto_open=True)

In [34]:
# Calculate the average score per user
user_avg_scores = data.groupby('userName')['score'].mean().reset_index()

# Plot the average score distribution
fig = px.histogram(user_avg_scores, x='score', title='Average Score Distribution')
fig.show()
pio.write_html(fig, file='average_score_distribution.html', auto_open=True)

In [35]:
comparison_df = pd.DataFrame({
    'feature': high_rating_feature_mentions_df['feature'],
    'high_rating_mentions': high_rating_feature_mentions_df['mentions'],
    'low_rating_mentions': low_rating_feature_mentions_df['mentions']
})

fig = px.bar(comparison_df, x='feature', y=['high_rating_mentions', 'low_rating_mentions'],
             barmode='group', title='Feature Mention Comparison')
fig.show()
pio.write_html(fig, file='feature_mention_comparison.html', auto_open=True)

In [31]:
user_scores_mentions = pd.DataFrame({
    'average_score': [pattern['average_score'] for pattern in user_patterns.values()],
    'total_mentions': [sum(pattern['feature_mentions'].values()) for pattern in user_patterns.values()]
})

fig = px.scatter(user_scores_mentions, x='total_mentions', y='average_score',
                 title='User Score vs. Total Feature Mentions')
fig.show()