# GitHub Repository Analysis Using a Fictional API

This notebook connects to a fictional API that provides information about GitHub repositories over time. We will fetch data from the API, process it, analyze repository growth and average stars per year, and identify the most popular repositories.

In [1]:
# Import necessary libraries
import requests
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Set the style for the plots.
# `sns.set` is only an alias of `sns.set_theme`, which is the preferred
# interface; the alias may be removed in a future seaborn release.
sns.set_theme(style="whitegrid")

In [2]:
# Step 1: Define the API endpoint and fetch data
API_URL = "https://api.fictionalgithub.com/repos"

# Example: Fetch repositories data over time (fictional endpoint)
def fetch_repo_data(year, timeout=10):
    """
    Fetch repository data for a given year from the fictional API.

    Sends a GET request to ``API_URL`` with ``year`` as the ``year`` query
    parameter and decodes the JSON body of the response.

    Args:
        year: Year to request; forwarded unchanged as a query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded JSON list of repositories on success. On any failure
        (network error, non-200 status, undecodable body, or a body that is
        not a JSON list) an error message is printed and an empty list is
        returned, so callers can always ``extend`` a list with the result.
    """
    params = {'year': year}

    try:
        response = requests.get(API_URL, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Connection failures and timeouts take the same best-effort path
        # as a bad status code instead of aborting the whole notebook.
        print(f"Error fetching data for {year}: {exc}")
        return []

    if response.status_code != 200:
        print(f"Error fetching data for {year}: {response.status_code}")
        return []

    try:
        payload = response.json()
    except ValueError as exc:
        # The JSON decode errors raised by response.json() are ValueError
        # subclasses, so this covers an empty or non-JSON body.
        print(f"Error decoding data for {year}: {exc}")
        return []

    if not isinstance(payload, list):
        # A dict (e.g. an error envelope) would otherwise be iterated key by
        # key when the caller extends its list with it.
        print(f"Error fetching data for {year}: expected a JSON list, got {type(payload).__name__}")
        return []

    return payload

# Gather repositories for each of the last 5 years (fictional data)
years = list(range(2018, 2023))
repo_data = []

for year in years:
    # Append this year's repositories to the combined list
    repo_data += fetch_repo_data(year)

In [3]:
# Step 2: Process the data into a DataFrame
# Assuming the API returns data with fields like 'repo_name', 'created_at', 'stars', 'forks'
df = pd.DataFrame(repo_data)

# Fail fast with a readable message when the fetch returned nothing or the
# records lack the fields the following cells use ('repo_name', 'created_at',
# 'stars'); otherwise the problem only surfaces later as an opaque KeyError.
missing_columns = {'repo_name', 'created_at', 'stars'} - set(df.columns)
if missing_columns:
    raise ValueError(
        f"Repository data is missing expected fields: {sorted(missing_columns)} "
        f"({len(df)} records fetched)"
    )

# Display the first few rows of the data
df.head()

In [4]:
# Steps 3-4: Parse 'created_at' into datetimes, then derive the creation year.
# Keyword arguments to assign are applied in order, so the callable sees the
# already-converted 'created_at' column.
df = df.assign(
    created_at=pd.to_datetime(df['created_at']),
    created_year=lambda frame: frame['created_at'].dt.year,
)

# Step 5: Basic analysis
# Count the repositories created in each year
repos_per_year = df.groupby(by='created_year').size()

# Display the result
repos_per_year

In [5]:
# Step 6: Line chart of the number of repositories created per year
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(x=repos_per_year.index, y=repos_per_year.values, marker='o', ax=ax)
ax.set_title('Growth of GitHub Repositories Over Time', fontsize=16)
ax.set_xlabel('Year', fontsize=12)
ax.set_ylabel('Number of Repositories Created', fontsize=12)
# One tick per requested year
ax.set_xticks(years)
plt.show()

created_year
2018     45
2019     62
2020     84
2021    102
2022    115
dtype: int64

In [6]:
# Step 7: Additional analysis - mean star count of repositories, grouped by creation year
stars_by_year = df.groupby('created_year')['stars']
avg_stars_per_year = stars_by_year.mean()

# Bar chart of the yearly averages
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=avg_stars_per_year.index, y=avg_stars_per_year.values, ax=ax)
ax.set_title('Average Stars per Repository Over Time', fontsize=16)
ax.set_xlabel('Year', fontsize=12)
ax.set_ylabel('Average Stars', fontsize=12)
plt.show()

In [7]:
# Step 8: Identify trends - the 10 repositories with the highest star counts
ranked_by_stars = df.sort_values(by='stars', ascending=False)
top_repos_by_stars = ranked_by_stars[['repo_name', 'stars']].iloc[:10]

top_repos_by_stars