# Data Preprocessing 

In [40]:
from pymongo import MongoClient
import pandas as pd
from dotenv import load_dotenv
import os

# Read the .env file so that its entries become visible through os.environ.
load_dotenv()

# The connection string is supplied by the environment, never hard-coded here.
mongo_uri = os.environ.get("MONGO_URI")
if mongo_uri is None:
    raise Exception("MONGO_URI not found in environment variables")

# Open the connection and navigate to the collection holding the user records.
client = MongoClient(mongo_uri)
db = client["github_db"]
collection = db["user_data"]

# Materialise every document returned by the cursor, then build one DataFrame
# row per document and preview the first rows.
data = [document for document in collection.find()]
df = pd.DataFrame(data)
print(df.head())


                        _id     Login  \
0  683c7d2cc5036e7ccfba1afe   mojombo   
1  683c8a92c5036e7ccfba1b03   pjhyett   
2  683c8af6c5036e7ccfba1b04    wycats   
3  683c8b32c5036e7ccfba1b05  ezmobius   
4  683c8c14c5036e7ccfba1b06   defunkt   

                                      Avatar URL   Bio            Created At  \
0  https://avatars.githubusercontent.com/u/1?v=4        2007-10-20T05:24:19Z   
1  https://avatars.githubusercontent.com/u/3?v=4  None  2008-01-07T17:54:22Z   
2  https://avatars.githubusercontent.com/u/4?v=4  None  2008-01-12T05:38:33Z   
3  https://avatars.githubusercontent.com/u/5?v=4  None  2008-01-12T07:51:46Z   
4  https://avatars.githubusercontent.com/u/2?v=4     🍔  2007-10-20T05:24:19Z   

   Followers Count                                     Followers List  \
0            24221  [defunkt, pjhyett, vanpelt, brynary, technowee...   
1             8333  [mojombo, defunkt, technoweenie, takeo, atmos,...   
2            10278  [topfunky, joshknowles, rsanheim,

List the columns present in the dataset.

In [41]:

# Display the column labels of the frame loaded from MongoDB.
df.columns

Index(['_id', 'Login', 'Avatar URL', 'Bio', 'Created At', 'Followers Count',
       'Followers List', 'Following Count', 'Following List', 'Languages',
       'Name', 'Profile URL', 'Public Repositories', 'Total Commits',
       'Updated At', 'Organizations', 'Starred Repositories', 'Subscriptions',
       'Commit Dates', 'Commits Per Language', 'Commits Per Repo', 'Platforms',
       'Stars Per Language', 'Stars Per Repo', 'Web Frameworks'],
      dtype='object')

Check the shape of the dataset (rows, columns).

In [42]:
# Display the frame's dimensions as a (row count, column count) tuple.
df.shape

(52, 25)

Count the null values in each column.

In [43]:


# Flag every missing cell, then sum the flags column by column to get the
# number of nulls per column.
df.isnull().sum()

_id                      0
Login                    0
Avatar URL               0
Bio                     30
Created At               0
Followers Count          0
Followers List           2
Following Count          0
Following List           2
Languages                0
Name                     4
Profile URL              0
Public Repositories      0
Total Commits            0
Updated At               2
Organizations            0
Starred Repositories     0
Subscriptions            0
Commit Dates            51
Commits Per Language    49
Commits Per Repo        49
Platforms               49
Stars Per Language      49
Stars Per Repo          49
Web Frameworks          49
dtype: int64

Replace the null values with 0.

In [44]:


# Overwrite every missing value in `df` with the integer 0, mutating the frame
# in place (nothing is returned or rebound).
# NOTE(review): this is a blanket fill. The null counts above include columns
# whose displayed values are text (Bio) or lists (Followers List), so those
# columns now mix their real values with the integer 0. Any later code that
# reads them has to tolerate a 0 where a string/list/dict would be expected.
df.fillna(0, inplace=True)  # A per-column imputation would keep the column types consistent

In [45]:
# Display the raw 'Languages' column. In the captured output each entry is a
# dict keyed by language name with an integer value.
# NOTE(review): the meaning of the integers (bytes of code?) is not shown here; confirm upstream.
df['Languages']

0     {'CSS': 22033, 'JavaScript': 25850, 'Ruby': 56...
1                  {'Ruby': 53587, 'JavaScript': 26152}
2     {'Ruby': 4689292, 'Rust': 233807, 'CoffeeScrip...
3     {'Ruby': 1140441, 'C': 3134519, 'Scheme': 4915...
4     {'JavaScript': 4906512, 'Python': 9982, 'Objec...
5     {'C++': 53390, 'Starlark': 3472, 'JavaScript':...
6     {'Go': 59818975, 'C': 1028866, 'Shell': 616789...
7     {'Ruby': 235022, 'TypeScript': 1819206, 'Shell...
8     {'Shell': 229750, 'Makefile': 20454, 'CMake': ...
9     {'Ruby': 7602283, 'Gherkin': 64610, 'SCSS': 53...
10    {'Java': 17440940, 'Starlark': 93090, 'C': 937...
11    {'Ruby': 1494392, 'HTML': 65991, 'Go': 4442744...
12    {'JavaScript': 336396, 'Makefile': 314, 'Ruby'...
13    {'JavaScript': 647547, 'PHP': 111910, 'HTML': ...
14    {'JavaScript': 474219, 'Svelte': 10385, 'TypeS...
15    {'Ruby': 454087, 'R': 913208, 'CSS': 21737, 'J...
16    {'JavaScript': 3236926, 'HTML': 157577, 'CSS':...
17    {'JavaScript': 4375, 'HTML': 1721, 'CSS': 

In [46]:
# Step 1: derive 'Languages_list', the language names used by each user.
# A dict entry contributes its keys; anything else (for example a placeholder
# left behind for a missing value) contributes an empty list.
df['Languages_list'] = pd.Series(
    [list(entry.keys()) if isinstance(entry, dict) else [] for entry in df['Languages']],
    index=df.index,
    dtype=object,
)

# Preview the derived column
print(df['Languages_list'].head())


0    [CSS, JavaScript, Ruby, Erlang, Unknown, TypeS...
1                                   [Ruby, JavaScript]
2    [Ruby, Rust, CoffeeScript, JavaScript, Vue, CS...
3    [Ruby, C, Scheme, Perl, Shell, JavaScript, C++...
4    [JavaScript, Python, Objective-C, Lua, Clojure...
Name: Languages_list, dtype: object


Multi-hot encoding the `Languages` column (one binary indicator column per language)

In [47]:
from sklearn.preprocessing import MultiLabelBinarizer

# Turn each user's list of languages into a row of 0/1 indicators, one column
# per distinct language. `data=` is listed first on purpose: `mlb.classes_`
# only exists once `fit_transform` has run.
mlb = MultiLabelBinarizer()
languages_encoded = pd.DataFrame(
    data=mlb.fit_transform(df['Languages_list']),
    index=df.index,  # keep rows aligned with df
    columns=mlb.classes_,
)

# Append the indicator columns to the user frame, then discard the raw dict
# column and the helper list column they were derived from.
df = pd.concat([df, languages_encoded], axis='columns').drop(
    columns=['Languages', 'Languages_list']
)

# Confirm that the encoded columns are now part of the frame
print(df.columns)


Index(['_id', 'Login', 'Avatar URL', 'Bio', 'Created At', 'Followers Count',
       'Followers List', 'Following Count', 'Following List', 'Name',
       ...
       'Vim Script', 'Visual Basic', 'Vue', 'WebAssembly', 'XML', 'XSLT',
       'Xonsh', 'YASnippet', 'Yacc', 'reStructuredText'],
      dtype='object', length=149)


In [48]:
# Show the dimensions of the combined frame, then of the language indicators
# alone, one tuple per line.
print(df.shape, languages_encoded.shape, sep='\n')


(52, 149)
(52, 125)


Computing user-to-user cosine similarity

In [49]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd


In [51]:
from sklearn.preprocessing import MinMaxScaler

# Profile-level counters that are blended with the language indicators.
numeric_cols = ['Followers Count', 'Following Count', 'Public Repositories', 'Total Commits']

# Rescale every counter to MinMaxScaler's default [0, 1] range so that large
# raw values (follower counts run into the tens of thousands) do not dominate
# the 0/1 language columns when cosine similarity is computed.
scaler = MinMaxScaler()
numeric_scaled = pd.DataFrame(scaler.fit_transform(df[numeric_cols]), columns=numeric_cols, index=df.index)

# Combine the language encoding with the scaled counters, one row per user.
# Fix: this line used `language_features`, a name no cell in the notebook
# defines, so the cell raised NameError on a fresh kernel. The language
# indicator frame built by the encoding cell is `languages_encoded`; both
# frames share df.index, so the column-wise concat lines up row for row.
feature_matrix = pd.concat([languages_encoded, numeric_scaled], axis=1)


In [52]:
# Score every pair of users by the cosine similarity of their feature rows,
# then label both axes of the square result with the users' logins so a pair
# can be looked up by name.
user_logins = df['Login']
pairwise_scores = cosine_similarity(feature_matrix)
similarity_matrix = pd.DataFrame(data=pairwise_scores, index=user_logins, columns=user_logins)


In [54]:
import pickle

# Persist the cosine similarity DataFrame (similarity_matrix) to disk as a
# pickle in the current working directory.
with open('similarity_model.pkl', mode='wb') as model_file:
    pickle.dump(similarity_matrix, model_file)
