In [1]:
import pymongo
import pandas as pd
import os
from bson.objectid import ObjectId

# Connection settings. The URI is taken from the environment, so it is None
# when MONGO_CONNECTION_STRING is not set.
mongo_connection_string = os.environ.get("MONGO_CONNECTION_STRING")
mongo_uri = mongo_connection_string

# Database and collection names used when the client is opened.
database_name, collection_name = "lingwing", "usercourses"


In [2]:
# Aggregation pipeline:
#   1. add `percentNumeric`, the `percent` field converted to a double
#      (values that fail the conversion become 0),
#   2. keep only documents whose numeric percent is at least 60,
#   3. project just the fields used by the rest of the notebook.
pipeline = [
    {'$addFields': {'percentNumeric': {'$convert': {'input': '$percent', 'to': 'double', 'onError': 0}}}},
    {'$match': {'percentNumeric': {'$gte': 60}}},
    {'$project': {'_id': 0, 'courseSlug': 1, 'docInfo.user': 1, 'percent': 1}}
]

# The client is used as a context manager so the connection is closed even if
# the query or the DataFrame construction raises. The cursor is fully consumed
# inside the block, before the client is closed.
with pymongo.MongoClient(mongo_uri) as client:
    db = client[database_name]
    collection = db[collection_name]
    course_data = collection.aggregate(pipeline)
    df = pd.DataFrame(list(course_data))

In [3]:
# Flatten the nested `docInfo` sub-documents into their own columns. The
# aggregation projects only `docInfo.user`, so a `user` column is expected.
nested_doc_info = df['docInfo']
normalized_docInfo_df = pd.json_normalize(data=nested_doc_info)

In [4]:
# Attach the flattened docInfo columns next to the original ones. Columns of
# the same name that are already on `df` (e.g. left over from a previous run
# of this cell) are dropped first, so re-running the cell does not produce
# duplicate column labels.
df = pd.concat(
    [df.drop(columns=normalized_docInfo_df.columns, errors='ignore'), normalized_docInfo_df],
    axis=1,
)


In [5]:
# Remove the nested column now that it has been flattened. Rebinding (rather
# than mutating in place) with errors="ignore" keeps the cell safe to re-run
# after the column is already gone.
df = df.drop(columns=["docInfo"], errors="ignore")

In [6]:
# Slugs that are left out of the analysis.
excluded_slugs = ['english_hotel', 'georgian_alphabet']

# Coerce `percent` to a number (unparseable values become NaN) and order the
# rows by user, then by course slug.
df = (
    df.assign(percent=pd.to_numeric(df['percent'], errors='coerce'))
      .sort_values(by=['user', 'courseSlug'])
)

# Keep every row whose slug is not on the exclusion list.
df = df.loc[~df['courseSlug'].isin(excluded_slugs)]


In [7]:
# Split each slug at the FIRST underscore only: the part before it is the
# language, the remainder is the level. Limiting the split to one cut (n=1)
# and reindexing to exactly two columns keeps the assignment valid when a slug
# contains several underscores or when no slug contains one at all; an
# unbounded split would yield a different number of columns and raise.
slug_parts = (
    df['courseSlug']
    .str.split('_', n=1, expand=True)
    .reindex(columns=[0, 1])
)

# `assign` returns a new frame, so no columns are written onto a filtered
# slice of an earlier DataFrame (avoids SettingWithCopyWarning).
df = df.assign(language=slug_parts[0], level=slug_parts[1])

# Keep rows whose language part is purely alphabetic. `.eq(True)` turns
# missing results (from missing slugs) into False instead of letting NaN
# reach the boolean mask, which would raise.
df = df[df['language'].str.isalpha().eq(True)]

In [8]:

# For every language, number its distinct course slugs 0, 1, 2, ... following
# their sorted order: {language: {courseSlug: level_index}}.
language_level_mapping = {}
for language in df['language'].unique():
    slugs_for_language = df.loc[df['language'] == language, 'courseSlug'].unique()
    ordered_slugs = sorted(slugs_for_language)
    language_level_mapping[language] = dict(zip(ordered_slugs, range(len(ordered_slugs))))


In [9]:
def assign_course_level(row, mapping=None):
    """Return the numeric level of the course referenced by ``row``.

    Parameters
    ----------
    row : mapping-like
        Must provide ``row['language']`` and ``row['courseSlug']``
        (for example a DataFrame row passed by ``df.apply(..., axis=1)``).
    mapping : dict, optional
        ``{language: {courseSlug: level}}`` lookup table. Defaults to the
        module-level ``language_level_mapping``.

    Returns
    -------
    int
        The level index, or -1 when either the language or the course slug
        is not present in the lookup table.
    """
    if mapping is None:
        mapping = language_level_mapping
    # An unknown language is treated like an unknown slug (-1) instead of
    # raising KeyError, so both "not found" cases behave the same way.
    levels_for_language = mapping.get(row['language'], {})
    return levels_for_language.get(row['courseSlug'], -1)

# Look up the level of every row, then discard the rows for which the lookup
# reported "not found" (-1).
course_levels = df.apply(assign_course_level, axis=1)
df = df.assign(course_level=course_levels)
df = df.loc[df['course_level'].ne(-1)]


In [10]:
# Lowest course level present for each (user, language) pair.
levels_by_user_language = df.groupby(['user', 'language'])['course_level']
starting_levels = levels_by_user_language.min().reset_index()

# Number of users per (language, starting level).
starting_level_counts = (
    starting_levels
    .groupby(['language', 'course_level'])
    .size()
    .reset_index(name='user_count')
)

In [11]:
# Highest course level present for each (user, language) pair.
progression = (
    df.groupby(['user', 'language'])['course_level']
    .agg('max')
    .reset_index()
)

# One row per (user, language) carrying both the starting and the maximum
# level; the shared `course_level` column gets the two suffixes.
user_progression = pd.merge(
    starting_levels,
    progression,
    on=['user', 'language'],
    suffixes=('_start', '_max'),
)

# Number of users for every (language, start level, max level) combination.
progression_key = ['language', 'course_level_start', 'course_level_max']
progression_counts = (
    user_progression
    .groupby(progression_key)
    .size()
    .reset_index(name='user_count')
)


In [12]:
# One frame per language, indexed by level number (0 .. number of courses - 1);
# columns are added below as starting levels are encountered.
plot_data = {
    lang: pd.DataFrame(index=range(len(level_map)))
    for lang, level_map in language_level_mapping.items()
}

# Each row of `progression_counts` is already one unique
# (language, start level, max level) combination, so its user count is
# written straight into the cell at [max level, start level].
for row in progression_counts.itertuples(index=False):
    plot_data[row.language].loc[row.course_level_max, row.course_level_start] = row.user_count


In [13]:
# Inverse lookup of `language_level_mapping`: {language: {level_index: courseSlug}}.
reverse_language_level_mapping = {
    lang: dict(zip(slug_levels.values(), slug_levels.keys()))
    for lang, slug_levels in language_level_mapping.items()
}

In [15]:
import matplotlib.pyplot as plt
import os

# Directory that receives one PNG per language.
output_dir = './course-progression'
os.makedirs(output_dir, exist_ok=True)

for language, data in plot_data.items():
    data_filled = data.fillna(0)
    if data_filled.empty:
        # No progression counts were written for this language, so there is
        # nothing to draw (plotting a frame without columns raises).
        continue

    level_map = language_level_mapping[language]
    slug_by_level = reverse_language_level_mapping[language]

    # Explicit figure/axes instead of the pyplot state machine, so every
    # call below targets this figure and it can be closed unambiguously.
    fig, ax = plt.subplots(figsize=(10, 12))
    data_filled.plot(kind='bar', stacked=True, ax=ax)

    # Write the user count in the middle of every non-empty bar segment.
    for bar in ax.patches:
        bar_height = bar.get_height()
        if bar_height > 0:
            ax.annotate(f'{int(bar_height)}',
                        (bar.get_x() + bar.get_width() / 2, bar.get_y() + bar_height / 2),
                        ha='center', va='center',
                        color='black', xytext=(0, 0),
                        textcoords='offset points')

    ax.set_title(f'User Progression Funnel in {language.title()} Courses')
    ax.set_xlabel('Maximum Course Level Reached')
    ax.set_ylabel('Number of Users')
    ax.set_xticks(range(len(level_map)))
    ax.set_xticklabels(list(level_map.keys()), rotation=45)

    # Label each plotted series by the starting level stored in its column.
    # The frame only has columns for starting levels that actually occur, so
    # labelling by position (0, 1, 2, ...) would attach the wrong course name
    # whenever a level has no users starting there.
    legend_labels = [slug_by_level.get(level, 'Unknown') for level in data_filled.columns]
    handles, _ = ax.get_legend_handles_labels()
    ax.legend(handles, legend_labels, title='Starting Course',
              bbox_to_anchor=(1.05, 1), loc='upper left')

    ax.grid(axis='y')
    fig.tight_layout()

    fig.savefig(os.path.join(output_dir, f'{language}_progression.png'), bbox_inches='tight')
    # Close the figure so figures do not accumulate across iterations.
    plt.close(fig)




Next thing to implement: check course completions for each year to see how progression changes over time and how adding a new course affects users.