In [32]:
import pandas as pd
from collections import Counter
import ast

In [33]:
# Load the preprocessed dataset, decoding the file as Latin-1.
df = pd.read_csv(
    './finalData/final_preprocessed.csv',
    encoding='latin1',
)

In [34]:
# Peek at the first two entries of the Tags column as loaded from the CSV.
df['Tags'].head(2)

0                    ['flex', 'actionscript-3', 'air']
1    ['svn', 'tortoisesvn', 'branch', 'branching-an...
Name: Tags, dtype: object

In [35]:
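# Convert strings to actual lists safely. Each Tags value is a string holding a Python list literal (e.g. "['flex', 'air']"), which is not valid JSON, so we parse it with ast.literal_eval into a real Python list.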
def to_list(val):
    """Parse a stringified Python list literal into a real list.

    Args:
        val: Usually a string such as "['flex', 'air']". Values that are
            already lists are accepted and returned unchanged.

    Returns:
        list: The parsed list. A parsed tuple is converted to a list. An
        empty list is returned when ``val`` cannot be parsed as a Python
        literal (for example a NaN/None cell or malformed text) or when the
        parsed literal is not a list or tuple.
    """
    # Already converted (e.g. the cell was re-run on the same column): pass
    # it through. Without this guard literal_eval rejects the list object and
    # every row would silently collapse to [].
    if isinstance(val, list):
        return val
    try:
        parsed = ast.literal_eval(val)
    # Only the errors literal_eval raises for bad input; a bare `except:`
    # would also swallow KeyboardInterrupt / SystemExit.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []
    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, tuple):
        return list(parsed)
    # Any other literal (number, bare string, dict, ...) is not a tag list.
    return []

# Run the parser over every entry of the Tags column, element by element.
df['Tags'] = df['Tags'].map(to_list)

In [36]:
# Re-inspect the first two entries after conversion; they now display as
# lists rather than quoted strings.
df['Tags'].head(2)

0                          [flex, actionscript-3, air]
1    [svn, tortoisesvn, branch, branching-and-merging]
Name: Tags, dtype: object

In [38]:
# Collect every tag from every row into one flat list, then tally how often
# each tag occurs.
all_tags = []
for row_tags in df['Tags']:
    all_tags.extend(row_tags)
tag_counts = Counter(all_tags)

In [40]:
# The ten most frequent tags, most common first (names only, counts dropped).
top_10_tags = [entry[0] for entry in tag_counts.most_common(10)]
print("Top 10 tags:", top_10_tags)

Top 10 tags: ['javascript', 'java', 'c#', 'php', 'android', 'jquery', 'python', 'html', 'c++', 'ios']


In [41]:
# Within each row, retain only the tags that belong to the top 10.
def _keep_top_tags(tags):
    """Return the tags from ``tags`` that appear in ``top_10_tags``, in order."""
    return [tag for tag in tags if tag in top_10_tags]

df['Tags'] = df['Tags'].apply(_keep_top_tags)

In [42]:
# Discard rows whose tag list is empty after the top-10 filter.
has_top_tag = df['Tags'].map(len) > 0
df = df[has_top_tag]

In [43]:
# Write the filtered frame to CSV, leaving out the index column.
df.to_csv(
    'top_10_filtered_data.csv',
    index=False,
)