In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
!pip install PyGithub -q
from github import Github
import pandas as pd,numpy as np
from pathlib import Path
import os

# Function to load datasets
def load_dataset(base_url, dataset_names):
    """Read ``<base_url>/<name>.csv`` for each requested dataset name.

    Returns a dict mapping every name that could be read to its DataFrame.
    A dataset that fails to load is reported on stdout and left out of the
    result, so callers should check which keys are present.
    """
    frames = {}
    for dataset in dataset_names:
        try:
            frames[dataset] = pd.read_csv(f"{base_url}/{dataset}.csv")
        except Exception as err:
            # Best effort: keep going so one bad file does not block the rest.
            print(f"Error loading {dataset}: {err}")
    return frames

# Base URLs and dataset names
paris2024_base_url = "https://raw.githubusercontent.com/Katyayani09/Datasets/main/azure_projects/olympics_data/paris2024/raw_data"
datasets_paris = ["athletes", "coaches", "medals", "teams"]

# Load datasets
paris_data = load_dataset(paris2024_base_url, datasets_paris)

# load_dataset only prints a message when a file cannot be read, so stop here
# with a clear error instead of failing later with a KeyError on paris_data.
missing_datasets = [name for name in datasets_paris if name not in paris_data]
if missing_datasets:
    raise RuntimeError(f"Could not load required datasets: {missing_datasets}")

# Process datasets
def process_athletes_data(df):
    """Select and rename the athlete columns, tidy Discipline, drop duplicates.

    Keeps ``name``, ``gender``, ``country_long`` and ``disciplines`` under the
    names ``PersonName``, ``Gender``, ``Country`` and ``Discipline``. The
    discipline value is converted to text and stripped of square brackets
    and single quotes. The input frame is not modified.
    """
    column_map = {
        'name': 'PersonName',
        'gender': 'Gender',
        'country_long': 'Country',
        'disciplines': 'Discipline',
    }
    athletes = df[list(column_map)].rename(columns=column_map)

    # Remove the characters "[", "]" and "'" from the textual discipline value.
    discipline_text = athletes['Discipline'].astype(str)
    athletes['Discipline'] = discipline_text.str.replace(r'[\[\]\']', '', regex=True)

    return athletes.drop_duplicates()

def process_coaches_data(df):
    """Select and rename the coach columns, then drop duplicate rows.

    Keeps ``name``, ``country_long``, ``disciplines`` and ``events`` under the
    names ``Name``, ``Country``, ``Discipline`` and ``Event``.
    """
    column_map = {
        'name': 'Name',
        'country_long': 'Country',
        'disciplines': 'Discipline',
        'events': 'Event',
    }
    coaches = df[list(column_map)].rename(columns=column_map)
    return coaches.drop_duplicates()

def process_medals_data(df):
    """Build a per-country medal table from one-row-per-medal data.

    Args:
        df: DataFrame with at least ``country_long`` and ``medal_type``
            columns. The first word of ``medal_type`` must be gold, silver
            or bronze in any letter case (for example ``"Gold Medal"``).
            Rows missing either value are ignored.

    Returns:
        DataFrame with the columns ``Rank``, ``TeamCountry``, ``Gold``,
        ``Silver``, ``Bronze``, ``Total`` and ``Rank by Total``, ordered by
        ``Rank``. ``Rank`` orders countries by gold, then silver, then bronze
        count; countries with identical counts share a rank. ``Rank by
        Total`` ranks by total medals, ties sharing the lowest rank.

    Raises:
        ValueError: if ``medal_type`` holds a label that is not gold, silver
            or bronze.
    """
    podium = ['Gold', 'Silver', 'Bronze']

    awarded = df.dropna(subset=['country_long', 'medal_type'])

    # Reduce labels such as "Gold Medal" or "gold" to a canonical colour so
    # the columns are matched by name rather than by alphabetical position.
    colour = awarded['medal_type'].astype(str).str.split().str[0].str.title()
    unknown = awarded.loc[~colour.isin(podium), 'medal_type'].unique().tolist()
    if unknown:
        raise ValueError(f"Unrecognised medal_type values: {unknown}")

    table = (
        awarded.assign(medal_type=colour)
        .groupby(['country_long', 'medal_type'])
        .size()
        .unstack(fill_value=0)
        # Guarantees all three columns even when a colour never occurs.
        .reindex(columns=podium, fill_value=0)
        .reset_index()
        .rename_axis(None, axis=1)
        .rename(columns={'country_long': 'TeamCountry'})
    )
    table['Total'] = table[podium].sum(axis=1)

    # Order the table by medal counts; the country name only makes the order
    # of tied rows deterministic.
    table = table.sort_values(
        by=podium + ['TeamCountry'],
        ascending=[False, False, False, True],
    ).reset_index(drop=True)

    # Every country takes the first position held by its (gold, silver,
    # bronze) combination, so identical records share the same rank.
    position = pd.Series(np.arange(1, len(table) + 1), index=table.index)
    table['Rank'] = position.groupby([table[col] for col in podium]).transform('min')

    table['Rank by Total'] = table['Total'].rank(method='min', ascending=False).astype(int)

    return table[['Rank', 'TeamCountry', 'Gold', 'Silver', 'Bronze', 'Total', 'Rank by Total']]

def process_teams_data(df):
    """Select and rename the team columns; duplicate rows are kept.

    Keeps ``team``, ``discipline``, ``country_long`` and ``events`` under the
    names ``TeamName``, ``Discipline``, ``Country`` and ``Event``.
    """
    column_map = {
        'team': 'TeamName',
        'discipline': 'Discipline',
        'country_long': 'Country',
        'events': 'Event',
    }
    teams = df[list(column_map)]
    return teams.rename(columns=column_map)

def process_gender_data(df):
    """Count athletes per discipline, split by gender.

    Args:
        df: DataFrame with ``Discipline`` and ``Gender`` columns, where
            ``Gender`` is ``'Female'`` or ``'Male'``.

    Returns:
        DataFrame with the columns ``Discipline``, ``Female``, ``Male`` and
        ``Total``, one row per discipline. A gender that does not occur in
        the data is reported as 0 instead of raising ``KeyError``.

    Raises:
        ValueError: if ``Gender`` holds a value other than ``'Female'`` or
            ``'Male'``, since it would not fit the output columns.
    """
    genders = ['Female', 'Male']

    counts = df.groupby(['Discipline', 'Gender']).size().unstack(fill_value=0)

    unexpected = [label for label in counts.columns if label not in genders]
    if unexpected:
        raise ValueError(f"Unexpected Gender values: {unexpected}")

    result = (
        # Ensure both gender columns exist, in a fixed order.
        counts.reindex(columns=genders, fill_value=0)
        .reset_index()
        .rename_axis(None, axis=1)
    )
    result['Total'] = result['Female'] + result['Male']
    return result
    
def process_athletes_data2(df):
    """Keep PersonName, Country and Discipline and drop duplicate rows."""
    wanted = ['PersonName', 'Country', 'Discipline']
    trimmed = df[wanted]
    return trimmed.drop_duplicates()

# Clean each raw dataset with its dedicated processing function.
c2_cleaned = process_coaches_data(paris_data['coaches'])
m2_cleaned = process_medals_data(paris_data['medals'])
t2_cleaned = process_teams_data(paris_data['teams'])

# The gender breakdown needs the Gender column, so it is computed before that
# column is removed from the athletes table.
a2_cleaned = process_athletes_data(paris_data['athletes'])
e2_cleaned = process_gender_data(a2_cleaned)
a2_cleaned = a2_cleaned[['PersonName', 'Country', 'Discipline']]

# Output tables keyed by the name used for the uploaded file.
datasets_cleaned = dict(
    Athletes=a2_cleaned,
    Coaches=c2_cleaned,
    Medals=m2_cleaned,
    Teams=t2_cleaned,
    EntriesGender=e2_cleaned,
)


datasets_cleaned

def upload_dataframes_to_github(repo, dataframes, base_path, commit_message="Add processed datasets", branch="main"):
    """Upload DataFrames as CSV files to a GitHub repository.

    Each DataFrame is written (without its index) to
    ``<base_path>/<lowercased name>.csv`` on ``branch``. A file that already
    exists is overwritten; otherwise it is created.

    Args:
        repo: PyGithub repository object to write to.
        dataframes: Mapping of dataset name to DataFrame.
        base_path: Directory inside the repository for the CSV files.
        commit_message: Commit message used for every file.
        branch: Branch to read from and commit to.

    Returns:
        None. The outcome of each file is printed; a failure for one
        dataset is reported and does not stop the remaining uploads.
    """
    # Local import keeps this function self-contained; the package is the
    # same one that provides the ``repo`` object.
    from github import UnknownObjectException

    for name, df in dataframes.items():
        try:
            file_path = f"{base_path}/{name.lower()}.csv"
            content = df.to_csv(index=False)

            # Only a "not found" answer means the file is absent. Any other
            # error (bad credentials, rate limit, network) is left to the
            # outer handler so it is reported as a failure rather than being
            # mistaken for a missing file.
            try:
                existing_file = repo.get_contents(file_path, ref=branch)
            except UnknownObjectException:
                existing_file = None
                print(f"{file_path} does not exist. Creating new file.")

            if existing_file is None:
                repo.create_file(file_path, commit_message, content, branch=branch)
                print(f"Uploaded: {file_path}")
            else:
                # Updating requires the blob SHA of the current version.
                repo.update_file(file_path, commit_message, content, existing_file.sha, branch=branch)
                print(f"Overwritten: {file_path}")

        except Exception as e:
            print(f"Failed to upload {name}: {e}")


# Fetch GitHub token from environment variable
github_token = os.getenv("GITHUB_ACCOUNT")
if not github_token:
    raise ValueError("GitHub token not found. Set the GITHUB_ACCOUNT environment variable to the token.")

g = Github(github_token)

# Access the repository
repo_name = "Katyayani09/Datasets"
try:
    repo = g.get_repo(repo_name)
except Exception as e:
    # Chain the original error so the underlying cause stays in the traceback.
    raise ValueError(f"Failed to access repository {repo_name}: {e}") from e

# Upload cleaned datasets to GitHub
upload_dataframes_to_github(repo, datasets_cleaned, "azure_projects/olympics_data/paris2024/processed_data")
#upload_dataframes_to_github(repo, datasets_cleaned, "azure_projects/delete_data")
# The upload function reports failures per file instead of raising, so this
# line cannot claim that every upload succeeded.
print("Upload run finished. Check the per-file messages above for any failures.")

##################################################################################################