# Importing required modules

In [74]:
import requests
import pandas as pd
from datetime import datetime
import statsmodels.api as sm
from scipy.stats import pearsonr

# Extracting Data from github

In [None]:
# Set up GitHub API and authentication
GITHUB_TOKEN = 'user your token here'
HEADERS = {'Authorization': f'token {GITHUB_TOKEN}'}
BASE_URL = 'https://api.github.com'

# Helper function to clean up company names
def clean_company(company):
    if company:
        return company.strip().replace('@', '').upper()
    return None

# Function to get users from Hyderabad with over 50 followers
def fetch_users(location="Hyderabad", min_followers=50):
    users = []
    url = f"{BASE_URL}/search/users"
    params = {
        "q": f"location:{location} followers:>{min_followers}",
        "per_page": 100
    }

    while url:
        response = requests.get(url, headers=HEADERS, params=params)
        data = response.json()

        for item in data.get('items', []):
            user_data = requests.get(f"{BASE_URL}/users/{item['login']}", headers=HEADERS).json()
            users.append({
                'login': user_data.get('login'),
                'name': user_data.get('name'),
                'company': clean_company(user_data.get('company')),
                'location': user_data.get('location'),
                'email': user_data.get('email'),
                'hireable': user_data.get('hireable'),
                'bio': user_data.get('bio'),
                'public_repos': user_data.get('public_repos'),
                'followers': user_data.get('followers'),
                'following': user_data.get('following'),
                'created_at': user_data.get('created_at')
            })

        # Get the 'next' link for pagination if available
        if 'next' in response.links:
            url = response.links['next']['url']
        else:
            url = None
    return users

# Function to get repositories for each user
def fetch_repositories(username):
    repos = []
    url = f"{BASE_URL}/users/{username}/repos"
    params = {"per_page": 100, "sort": "pushed"}

    while url:
        response = requests.get(url, headers=HEADERS, params=params)
        data = response.json()

        for repo in data:
            repos.append({
                'login': username,
                'full_name': repo.get('full_name'),
                'created_at': repo.get('created_at'),
                'stargazers_count': repo.get('stargazers_count'),
                'watchers_count': repo.get('watchers_count'),
                'language': repo.get('language'),
                'has_projects': repo.get('has_projects'),
                'has_wiki': repo.get('has_wiki'),
                'license_name': repo.get('license')['key'] if repo.get('license') else None  # Handle None license
            })
            if len(repos) >= 500:
                break

        # Get the 'next' link for pagination if available
        if 'next' in response.links:
            url = response.links['next']['url']
        else:
            url = None
    return repos

# Main script to fetch data and save to CSV
def main():
    # Fetch users
    print("Fetching users...")
    users = fetch_users()
    users_df = pd.DataFrame(users)
    users_df.to_csv("users.csv", index=False)
    print("Saved users to users.csv")

    # Fetch repositories for each user
    print("Fetching repositories...")
    all_repos = []
    for user in users:
        repos = fetch_repositories(user['login'])
        all_repos.extend(repos)

    # Save to repositories.csv
    repos_df = pd.DataFrame(all_repos)
    repos_df.to_csv("repositories.csv", index=False)
    print("Saved repositories to repositories.csv")

if __name__ == "__main__":
    main()


Fetching users...
Saved users to users.csv
Fetching repositories...
Saved repositories to repositories.csv


# Reading the extracting data

In [3]:
users=pd.read_csv('users.csv')
users.head()

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,iam-veeramalla,Abhishek Veeramalla,RED HAT,"Hyderabad, India",,,"Keep learning, sharing and growing || Principa...",45,16231,1,2018-09-19
1,in28minutes,,IN28MINUTES,"Hyderabad, India",in28minutes@gmail.com,True,"Helping 1 Million Learners learn Programming, ...",102,14368,0,2015-09-05
2,stacksimplify,STACKSIMPLIFY,STACKSIMPLIFY,Hyderabad,stacksimplify@gmail.com,,"Best Selling Instructor on Udemy - 2,10,000 St...",47,3233,0,2019-03-07
3,thenaveensaggam,NAVEEN SAGGAM,HTTPS://WWW.UIBRAINS.COM,Hyderabad,thenaveensaggam@gmail.com,,Founder: UiBrains Technologies\r\nEnthusiastic...,43,2164,1,2017-02-18
4,MadhavBahl,MADHAV BAHL,MICROSOFT,"Hyderabad, India",madhavbahl10@gmail.com,True,The Lean Programmer | Software Engineer @Micro...,128,1589,1,2017-03-04


In [4]:
repos=pd.read_csv('repositories.csv')
repos.head()

Unnamed: 0,login,full_name,created_at,stargazers_count,watchers_count,language,has_projects,has_wiki,license_name
0,iam-veeramalla,iam-veeramalla/observability-zero-to-hero,2023-03-16T14:41:25Z,1502,1502,Go,True,True,
1,iam-veeramalla,iam-veeramalla/community-operators-prod,2023-08-16T17:04:56Z,106,106,Dockerfile,True,False,apache-2.0
2,iam-veeramalla,iam-veeramalla/community-operators,2023-08-11T18:48:35Z,101,101,Dockerfile,True,False,apache-2.0
3,iam-veeramalla,iam-veeramalla/MERN-docker-compose,2024-08-31T17:17:07Z,82,82,JavaScript,True,True,
4,iam-veeramalla,iam-veeramalla/argocd-operator,2023-07-03T09:46:13Z,238,238,,True,False,apache-2.0


# Question-1

Who are the top 5 users in Hyderabad with the highest number of followers? List their login in order, comma-separated.

Users

In [63]:
top_5_users=users.sort_values(by='followers',ascending=False)
top_5_logins = ','.join(top_5_users['login'].head(5))
print('Answer:',top_5_logins)

Answer: iam-veeramalla,in28minutes,stacksimplify,thenaveensaggam,MadhavBahl


# Question-2

Who are the 5 earliest registered GitHub users in Hyderabad? List their login in ascending order of created_at, comma-separated.

Users

In [64]:
users['created_at']=pd.to_datetime(users['created_at'])
earliest_registered_users=users.sort_values(by='created_at')
early_users = ','.join(earliest_registered_users['login'].head(5))
print('Answer:',early_users)

Answer: shabda,sitaramc,bagwanpankaj,srikanthlogic,kulbirsaini


# Question-3

 What are the 3 most popular license among these users? Ignore missing licenses. List the license_name in order, comma-separated.

 Licenses

In [65]:
top_licenses=repos['license_name'].value_counts().reset_index()
top_3_licenses=','.join(top_licenses['license_name'].head(3))
print('Answer:',top_3_licenses)

Answer: mit,apache-2.0,other


# Question - 4

Which company do the majority of these developers work at?

Company (cleaned up as explained above)

In [11]:
users.groupby('company')['login'].count().reset_index().sort_values(by='login',ascending=False)

Unnamed: 0,company,login
97,IIIT HYDERABAD,17
146,MICROSOFT,17
78,GOOGLE,12
119,INTERNATIONAL INSTITUTE OF INFORMATION TECHNOL...,8
189,SALESFORCE,5
...,...,...
87,HTTPS://LEARNDEVOPSONLINE.COM,1
88,HTTPS://RAJESHKARRA.ACADEMIA.EDU/,1
89,HTTPS://WWW.UIBRAINS.COM,1
90,HUMAN SCIENCES GROUP @ IIITH,1


IIT HYDERABAD is not a company. So, MICROSOFT should be the answer.

# Question - 5

Which programming language is most popular among these users?

Language

In [14]:
repos.groupby('language')['login'].count().reset_index().sort_values(by='login',ascending=False).reset_index(drop=True)['language'][0]

'JavaScript'

# Question - 6

Which programming language is the second most popular among users who joined after 2020?

In [55]:
after_2020=users[users['created_at'].dt.year>2020]
merged_df=pd.merge(after_2020,repos,on='login',how='inner')
merged_df['language'].value_counts().reset_index()['language'][1]

'HTML'

# Question - 7

Which language has the highest average number of stars per repository?

Language

In [17]:
repos.groupby('language')['stargazers_count'].mean().reset_index().sort_values(by='stargazers_count',ascending=False).reset_index(drop=True)['language'][0]

'Perl'

# Question - 8

Let's define leader_strength as followers / (1 + following). Who are the top 5 in terms of leader_strength? List their login in order, comma-separated.

User Login

In [66]:
leaders=users
leaders['leader_strength']=leaders['followers']/(1+leaders['following'])
top_leaders=leaders.sort_values(by='leader_strength',ascending=False).reset_index(drop=True)
top_5_leaders=','.join(top_leaders['login'].head(5))
print('Answer:',top_5_leaders)

Answer: in28minutes,iam-veeramalla,stacksimplify,ashokitschool,thenaveensaggam


# Question - 9

 What is the correlation between the number of followers and the number of public repositories among users in Hyderabad?

 Correlation between followers and repos (to 3 decimal places, e.g. 0.123 or -0.123)

In [20]:
round(users['public_repos'].corr(users['followers']),3)

0.006

# Question - 10

 Does creating more repos help users get more followers? Using regression, estimate how many additional followers a user gets per additional public repository.

 Regression slope of followers on repos (to 3 decimal places, e.g. 0.123 or -0.123)

In [68]:
# Step 1: Define the dependent and independent variables
X = users['public_repos']  # Independent variable (public_repos)
y = users['followers']      # Dependent variable (followers)

# Step 2: Add a constant to the independent variable for the regression model
X = sm.add_constant(X)

# Step 3: Fit the regression model
model = sm.OLS(y, X).fit()

# Print the regression results
print(model.summary())

# Step 4: Get the coefficient for public_repos
additional_followers_per_repo = model.params['public_repos']
print(f"Additional followers per public repository: {additional_followers_per_repo:.3f}")


                            OLS Regression Results                            
Dep. Variable:              followers   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.002
Method:                 Least Squares   F-statistic:                   0.01987
Date:                Sat, 02 Nov 2024   Prob (F-statistic):              0.888
Time:                        17:27:23   Log-Likelihood:                -4187.9
No. Observations:                 504   AIC:                             8380.
Df Residuals:                     502   BIC:                             8388.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const          201.3247     56.601      3.557   

# Question - 11
Do people typically enable projects and wikis together? What is the correlation between a repo having projects enabled and having wiki enabled?

Correlation between projects and wiki enabled (to 3 decimal places, e.g. 0.123 or -0.123)

In [75]:
correlation, p_value = pearsonr(repos['has_projects'], repos['has_wiki'])
print(correlation)

0.17244978508860107


In [73]:
(repos['has_projects'].astype('int')).corr(repos['has_wiki'].astype('int'))

0.17244978508860023

# Question - 12

Do hireable users follow more people than those who are not hireable?

Average of following per user for hireable=true minus the average following for the rest (to 3 decimal places, e.g. 12.345 or -12.345)

# Question - 13

Some developers write long bios. Does that help them get more followers? What's the impact of the length of their bio (in Unicode words, split by whitespace) with followers? (Ignore people without bios)

Regression slope of followers on bio word count (to 3 decimal places, e.g. 12.345 or -12.345)

# Question - 14

Who created the most repositories on weekends (UTC)? List the top 5 users' login in order, comma-separated

Users login

In [76]:
repos['created_at']=pd.to_datetime(repos['created_at'])
weekend_repos=repos[repos['created_at'].dt.weekday>=5]
top_weekenders=weekend_repos['login'].value_counts().reset_index()
top_5_weekenders=','.join(top_weekenders['login'].head())
print('Answer:',top_5_weekenders)

Answer: hemanth22,anjijava16,wahidKhan74,elevenpassin,narasimhavuppala


# Question - 15

 Do people who are hireable share their email addresses more often?

[fraction of users with email when hireable=true] minus [fraction of users with email for the rest] (to 3 decimal places, e.g. 0.123 or -0.123)

In [33]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 504 entries, 0 to 503
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   login            504 non-null    object        
 1   name             498 non-null    object        
 2   company          318 non-null    object        
 3   location         504 non-null    object        
 4   email            264 non-null    object        
 5   hireable         195 non-null    object        
 6   bio              416 non-null    object        
 7   public_repos     504 non-null    int64         
 8   followers        504 non-null    int64         
 9   following        504 non-null    int64         
 10  created_at       504 non-null    datetime64[ns]
 11  leader_strength  504 non-null    float64       
dtypes: datetime64[ns](1), float64(1), int64(3), object(7)
memory usage: 47.4+ KB


In [43]:
hireable_with_email_count=len(users[(users['hireable']==True) & (users['email'].notna())])
non_hireable_with_email_count=len(users[(users['hireable']!=True) & (users['email'].notna())])
round(hireable_with_email_count/len(users)-non_hireable_with_email_count/len(users),3)

0.004

# Question - 16

Let's assume that the last word in a user's name is their surname (ignore missing names, trim and split by whitespace.) What's the most common surname? (If there's a tie, list them all, comma-separated, alphabetically)

Most common surname(s)

In [29]:
users_name=users['name'].str.split(' ',expand=True).rename({0:'first_name',1:'surname'},axis=1)
users_name.groupby('surname')['first_name'].count().reset_index().sort_values(by='first_name',ascending=False).reset_index(drop=True)['surname'][0]

'Kumar'