In [138]:
import pandas as pd

In [139]:
# Load the dataset written by the first cleaning step; every later cell works on `df`.
df = pd.read_csv(filepath_or_buffer=r'../data/cleaned_dataset_step_1.csv')

In [140]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

Unnamed: 0,Country,Repositories Count,Followers,Most Used Language,Total Stars,Account Created At,Hireable,Unique Topics,Total Repository Size (KB)
0,Singapore,30,102331,JavaScript,2980,2010-11-28T01:05:40Z,,,155931
1,San Francisco,30,44504,Python,11169,2009-07-25T19:06:27Z,,"robotics, audio-synthesis, denoising-diffusion...",679441
2,"Austin, TX",30,44094,JavaScript,23441,2009-11-08T06:56:21Z,True,"async-patterns, observables, security, arrow-f...",62877
3,"San Francisco, CA",30,36793,Python,4517,2011-12-22T09:57:32Z,True,,305704
4,"9th Ring, Vim",30,36494,JavaScript,4675,2013-05-17T15:05:59Z,,,290287


In [141]:
# Baseline: number of missing (NaN) entries in each column before any handling.
missing_values = df.isna().sum()
print("Missing values per column: ", missing_values, sep="\n")

Missing values per column: 
Country                       3662
Repositories Count              10
Followers                       10
Most Used Language            1671
Total Stars                     10
Account Created At              13
Hireable                      8247
Unique Topics                 6538
Total Repository Size (KB)      10
dtype: int64


In [142]:
# Fill the missing 'Hireable' flags with False and store the column as a real boolean.
# NOTE(review): this assumes a missing flag means "not hireable" - confirm against the data source.
# The column is converted through the nullable 'boolean' dtype first so the result does not
# depend on fillna() silently downcasting an object column (deprecated since pandas 2.2);
# the final astype(bool) pins the dtype to plain bool on every pandas version.
df['Hireable'] = df['Hireable'].astype('boolean').fillna(False).astype(bool)

In [143]:
# Re-count the missing entries per column; 'Hireable' should now report zero.
missing_values = df.isna().sum()
print("Missing values per column after handling 'Hireable': ", missing_values, sep="\n")

Missing values per column after handling 'Hireable': 
Country                       3662
Repositories Count              10
Followers                       10
Most Used Language            1671
Total Stars                     10
Account Created At              13
Hireable                         0
Unique Topics                 6538
Total Repository Size (KB)      10
dtype: int64


In [144]:
# Check whether 'Repositories Count', 'Total Repository Size (KB)', 'Total Stars' and
# 'Followers' are missing on the same rows: keep only the rows where all four are NaN
# at once and count them. If the count equals each column's individual missing count,
# the gaps coincide.
# NOTE(review): the printed label says "Top Repo" although the column checked is
# 'Total Repository Size (KB)'.
missing_rows = df[
    df[['Total Repository Size (KB)', 'Repositories Count', 'Total Stars', 'Followers']]
    .isna()
    .all(axis=1)
]
print("Total count of missing Top Repo, Repo count, total stars and followers: ", missing_rows.shape[0])

Total count of missing Top Repo, Repo count, total stars and followers:  10


In [145]:
# Remove the rows in which all four numeric attributes are missing simultaneously
# (how='all' keeps any row that still has at least one of them).
df = df.dropna(
    how='all',
    subset=[
        'Total Repository Size (KB)',
        'Repositories Count',
        'Total Stars',
        'Followers',
    ],
)

In [146]:
# Re-count the missing entries per column after dropping the rows with no numeric data.
missing_values = df.isna().sum()
print(
    "Missing values per column after hadling 'Total Repository Size (KB)','Repositories Count','Total Stars' and 'Followers': ",
    missing_values,
    sep="\n",
)

Missing values per column after hadling 'Total Repository Size (KB)','Repositories Count','Total Stars' and 'Followers': 
Country                       3652
Repositories Count               0
Followers                        0
Most Used Language            1661
Total Stars                      0
Account Created At               3
Hireable                         0
Unique Topics                 6528
Total Repository Size (KB)       0
dtype: int64


In [147]:
# Remove every row whose 'Account Created At' value is missing.
df = df.dropna(how='any', subset=['Account Created At'])

In [148]:
# Re-count the missing entries per column; 'Account Created At' should now report zero.
missing_values = df.isna().sum()
print("Missing values per column after hadling 'Account Created At': ", missing_values, sep="\n")

Missing values per column after hadling 'Account Created At': 
Country                       3652
Repositories Count               0
Followers                        0
Most Used Language            1658
Total Stars                      0
Account Created At               0
Hireable                         0
Unique Topics                 6525
Total Repository Size (KB)       0
dtype: int64


In [149]:
# Force the four count/size columns to a numeric dtype. errors='coerce' turns any value
# that cannot be parsed as a number into NaN instead of raising.
for numeric_col in ['Repositories Count', 'Followers', 'Total Stars', 'Total Repository Size (KB)']:
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

# Parse the account-creation timestamps; unparseable values become NaT (Not a Time).
df['Account Created At'] = pd.to_datetime(df['Account Created At'], errors='coerce')

# The coercions may have introduced new NaN/NaT entries, so count the gaps again.
missing_values = df.isna().sum()
print("Missing values per column after handling conversions:", missing_values, sep="\n")



Missing values per column after handling conversions:
Country                       3652
Repositories Count               1
Followers                        1
Most Used Language            1658
Total Stars                      1
Account Created At               1
Hireable                         0
Unique Topics                 6525
Total Repository Size (KB)       1
dtype: int64


In [150]:
# Check whether the values coerced to NaN/NaT by the conversions sit on the same rows:
# keep only the rows where the four numeric columns AND 'Account Created At' are all
# missing at once, then count them.
# NOTE(review): the printed label mentions only four attributes (and says "Top Repo"),
# while the mask below covers five columns.
missing_rows = df[
    df[['Total Repository Size (KB)', 'Repositories Count', 'Total Stars', 'Followers', 'Account Created At']]
    .isna()
    .all(axis=1)
]
print("Total count of missing Top Repo, Repo count, total stars and followers: ", missing_rows.shape[0])

Total count of missing Top Repo, Repo count, total stars and followers:  1


In [151]:
# Remove the rows in which all five converted attributes (the four numeric columns and
# 'Account Created At') are missing simultaneously; how='all' leaves partially
# populated rows untouched.
df = df.dropna(
    how='all',
    subset=[
        'Total Repository Size (KB)',
        'Repositories Count',
        'Total Stars',
        'Followers',
        'Account Created At',
    ],
)

In [152]:
# Final count of missing entries per column after removing the rows whose values were
# coerced to NaN/NaT.
missing_values = df.isna().sum()
print("Missing values per column after hadling all coreced values: ", missing_values, sep="\n")

Missing values per column after hadling all coreced values: 
Country                       3652
Repositories Count               0
Followers                        0
Most Used Language            1658
Total Stars                      0
Account Created At               0
Hireable                         0
Unique Topics                 6525
Total Repository Size (KB)       0
dtype: int64


In [153]:
# Persist the cleaned frame for the next step; index=False keeps the row index out of the file.
df.to_csv(path_or_buf='../data/cleaned_dataset_step_2.csv', index=False)
