In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from collections import defaultdict, Counter
import nltk

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Harsh
[nltk_data]     Clean\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
data = pd.read_csv('../../Warehouse/Reviews/app_reviews_merged.csv')

In [4]:
# Preprocess the data
data['content'] = data['content'].str.lower()

# Specific features to look for in reviews
features = [
    'user interface', 'account registration', 'transaction process', 'payment options', 'security', 
    'rewards program', 'customer support', 'app performance', 'transaction history', 'notifications', 
    'savings features', 'expense tracking', 'budgeting tools', 'cashback offers', 'rewards redemption', 
    'bill payments', 'money transfer', 'ease of use', 'card management', 'app updates', 
    'user feedback responsiveness', 'offers and promotions', 'offline usability', 'transaction limits', 
    'foreign currency support', 'reliability', 'data privacy', 'user onboarding', 'transaction fees', 
    'account settings', 'account linking', 'app design', 'financial education resources', 'synchronization with bank accounts'
]

In [5]:
# Create a dictionary to store user patterns
user_patterns = defaultdict(lambda: {'average_score': 0, 'feature_mentions': Counter()})

# Calculate the average score per user
user_avg_scores = data.groupby('userName')['score'].mean()

In [6]:
# Tokenize the review content and count the mentions of specific features for each user
for _, row in data.iterrows():
    user = row['userName']
    content = row['content']
    tokens = word_tokenize(content)
    
    for feature in features:
        feature_tokens = word_tokenize(feature)
        if set(feature_tokens).issubset(set(tokens)):
            user_patterns[user]['feature_mentions'][feature] += 1 #type: ignore

In [7]:
# Update the average score in user_patterns
for user, avg_score in user_avg_scores.items():
    user_patterns[user]['average_score'] = avg_score

In [8]:
# Group users based on their average scores and feature mentions
high_rating_users = [user for user, pattern in user_patterns.items() if pattern['average_score'] >= 4] #type: ignore
low_rating_users = [user for user, pattern in user_patterns.items() if pattern['average_score'] < 4] #type: ignore

In [9]:
print("High rating users:")
print(high_rating_users)
print("\nLow rating users:")
print(low_rating_users)

High rating users:

Low rating users:
['king_clicker', 'Loki', 'Nitish kumar Rai', 'Rakibul Islam', 'Tejas M P', '37-Krrish Salian', 'MD FAISAL', 'Bandari Omkar', 'BENIWAL vj', 'GM GURU', 'Harshithsai Harshithsai Patel', 'Ajay kumar Jha', 'Ashutosh Kumar', 'Shubham Gupta', 'Kartik Tyagi', 'Krishna Kashyap', 'IB GAMERS', 'Krishiv Suwarna', 'MR LoLzZz', 'Balkishan Agrawal', 'Bipin', 'Sharath Panday', 'Masood Ahmad', 'Aagam Jain', 'Since 1982', 'Akshat Agrawal', 'Tejas Pawar', 'Aman Singh', 'Yash Mishra', 'Ramit Gupta', 'Aman Kumar', 'Mr Dashrath', 'Harshita Basak', 'Tanveer Khan', 'Merajul Islam', 'pratham doshi', 'Aadi Dubey', 'lucky Singh', 'Ravi Pandit', 'Ankita Devi', 'Ashish Reddu', 'Shubh Singh Adhikari', 'Vinayak Badiger', 'kishlay prakash', 'Vipra goyal', 'Sahil', 'Devil Gaming', 'Yadav Kunal', 'Suman Kumar', 'Mayur bhalchiM', 'Vighnesh Mandwaria', 'Ashik UDDIN', 'ro se', 'Som Giri Gosami', 'Deep Mehta', 'Gunjesh Gaurav', 'Uteesh Kumar', 'Shakshi jindal', 'Nayna Shewakramani', 'A

In [10]:
# Analyze feature mentions for different user segments
high_rating_feature_mentions = Counter()
low_rating_feature_mentions = Counter()

for user in high_rating_users:
    high_rating_feature_mentions += user_patterns[user]['feature_mentions'] #type: ignore

for user in low_rating_users:
    low_rating_feature_mentions += user_patterns[user]['feature_mentions'] #type: ignore

In [11]:
print("\nFeature mentions by high rating users:")
print(high_rating_feature_mentions)
print("\nFeature mentions by low rating users:")
print(low_rating_feature_mentions)


Feature mentions by high rating users:
Counter({'money transfer': 786, 'security': 95, 'user interface': 81, 'app design': 68, 'customer support': 63, 'app updates': 62, 'payment options': 38, 'app performance': 33, 'transaction process': 13, 'bill payments': 13, 'account linking': 13, 'cashback offers': 11, 'transaction history': 9, 'notifications': 6, 'savings features': 4, 'transaction limits': 4, 'rewards program': 3, 'transaction fees': 2, 'card management': 2, 'account registration': 1, 'ease of use': 1})

Feature mentions by low rating users:
Counter({'money transfer': 624, 'customer support': 327, 'app updates': 73, 'security': 61, 'transaction process': 47, 'payment options': 34, 'app performance': 31, 'transaction history': 25, 'user interface': 22, 'notifications': 12, 'app design': 11, 'account registration': 11, 'transaction limits': 9, 'account settings': 5, 'account linking': 4, 'cashback offers': 3, 'transaction fees': 3, 'bill payments': 3, 'card management': 2, 'savi

In [12]:
# Save high rating users to a CSV file
high_rating_users_df = pd.DataFrame(high_rating_users, columns=['userName'])
high_rating_users_df.to_csv('high_rating_users.csv', index=False)

# Save low rating users to a CSV file
low_rating_users_df = pd.DataFrame(low_rating_users, columns=['userName'])
low_rating_users_df.to_csv('low_rating_users.csv', index=False)

# Save feature mentions by high rating users to a CSV file
high_rating_feature_mentions_df = pd.DataFrame.from_dict(high_rating_feature_mentions, orient='index', columns=['mentions'])
high_rating_feature_mentions_df.reset_index(level=0, inplace=True)
high_rating_feature_mentions_df.rename(columns={'index': 'feature'}, inplace=True)
high_rating_feature_mentions_df.to_csv('high_rating_feature_mentions.csv', index=False)

# Save feature mentions by low rating users to a CSV file
low_rating_feature_mentions_df = pd.DataFrame.from_dict(low_rating_feature_mentions, orient='index', columns=['mentions'])
low_rating_feature_mentions_df.reset_index(level=0, inplace=True)
low_rating_feature_mentions_df.rename(columns={'index': 'feature'}, inplace=True)
low_rating_feature_mentions_df.to_csv('low_rating_feature_mentions.csv', index=False)

In [13]:
# plot the feature mentions using plotly
import plotly.express as px
import plotly.io as pio

fig = px.bar(high_rating_feature_mentions_df, x='feature', y='mentions', title='Feature mentions by high rating users')
fig.show()
pio.write_html(fig, file='../../fam-report-site/public/App-Analytics/high_rating_feature_mentions.html', auto_open=True)

In [14]:
fig = px.bar(low_rating_feature_mentions_df, x='feature', y='mentions', title='Feature mentions by low rating users')
fig.show()
pio.write_html(fig, file='../../fam-report-site/public/App-Analytics/low_rating_feature_mentions.html', auto_open=True)

In [15]:
# Calculate the average score per user
user_avg_scores = data.groupby('userName')['score'].mean().reset_index()

# Plot the average score distribution
fig = px.histogram(user_avg_scores, x='score', title='Average Score Distribution')
fig.show()
pio.write_html(fig, file='../../fam-report-site/public/App-Analytics/average_score_distribution.html', auto_open=True)

In [16]:
comparison_df = pd.DataFrame({
    'feature': high_rating_feature_mentions_df['feature'],
    'high_rating_mentions': high_rating_feature_mentions_df['mentions'],
    'low_rating_mentions': low_rating_feature_mentions_df['mentions']
})

fig = px.bar(comparison_df, x='feature', y=['high_rating_mentions', 'low_rating_mentions'],
             barmode='group', title='Feature Mention Comparison')
fig.show()
pio.write_html(fig, file='../../fam-report-site/public/App-Analytics/feature_mention_comparison.html', auto_open=True)

In [17]:
user_scores_mentions = pd.DataFrame({
    'average_score': [pattern['average_score'] for pattern in user_patterns.values()],
    'total_mentions': [sum(pattern['feature_mentions'].values()) for pattern in user_patterns.values()]
})

fig = px.scatter(user_scores_mentions, x='total_mentions', y='average_score',
                 title='User Score vs. Total Feature Mentions')
fig.show()