In [1]:
# importing libraries
import pandas as pd
import mysql.connector
import requests


In [2]:
import requests
import time
from requests.exceptions import RequestException

def fetch_trending_repositories(topics, max_pages=10, per_page=100, retries=3, backoff_factor=1):
    """Collect repository metadata from the GitHub search API for each topic.

    For every topic, result pages are requested from
    ``https://api.github.com/search/repositories`` sorted by stars, and one
    flat record is built per repository returned.

    Authentication: the access token is read from the ``GITHUB_TOKEN``
    environment variable. When the variable is unset or empty, requests are
    sent without an ``Authorization`` header.

    Args:
        topics: Iterable of search strings; each one is sent as the ``q``
            query parameter.
        max_pages: Highest page number requested per topic (pages start at 1).
        per_page: Value sent as the ``per_page`` query parameter.
        retries: Number of attempts made for a single page before it is skipped.
        backoff_factor: Base, in seconds, of the exponential back-off applied
            after a request error (``backoff_factor * 2 ** attempt``).

    Returns:
        list[dict]: One dict per repository with the keys ``topics``,
        ``Repository_Name``, ``Owner``, ``Description``, ``URL``,
        ``Programming_Language``, ``Creation_Date``, ``Last_Updated_Date``,
        ``Number_of_Stars``, ``Number_of_Forks``, ``Number_of_Open_Issues``
        and ``License_Type``.
    """
    import os  # local import so this cell does not rely on changes to the import cell

    data_list = []
    search_url = 'https://api.github.com/search/repositories'
    headers = {'Accept': 'application/vnd.github.v3+json'}

    # Never keep the token in the notebook: take it from the environment.
    token = os.environ.get('GITHUB_TOKEN')
    if token:
        headers['Authorization'] = f'token {token}'

    for topic in topics:
        for page in range(1, max_pages + 1):
            params = {
                'q': topic,
                'sort': 'stars',
                'per_page': per_page,
                'page': page
            }

            items = None  # stays None when every attempt for this page fails
            for attempt in range(retries):
                try:
                    # A timeout keeps a stalled connection from hanging the notebook forever.
                    response = requests.get(search_url, headers=headers, params=params, timeout=30)
                    if response.status_code == 403:
                        print(f"Rate limit hit or forbidden for topic {topic}. Retry after some time.")
                        print(f"Remaining API requests: {response.headers.get('X-RateLimit-Remaining')}")
                        time.sleep(60)  # Wait for rate limit reset
                        continue
                    response.raise_for_status()  # Check for HTTP errors
                    items = response.json().get('items', [])
                    break  # Exit retry loop if successful
                except RequestException as e:
                    print(f"Error fetching repositories for topic {topic} on page {page}: {e}")
                    time.sleep(backoff_factor * (2 ** attempt))  # Exponential backoff

            if items is None:
                # All attempts failed: report it and move on to the next page.
                print(f"Giving up on topic {topic} page {page} after {retries} attempts.")
                continue

            if not items:
                # An empty page means the results for this topic are exhausted,
                # so requesting further pages would only waste API quota.
                break

            for item in items:
                data_list.append({
                    'topics': topic,
                    'Repository_Name': item['name'],
                    'Owner': item['owner']['login'],
                    'Description': item['description'],
                    'URL': item['html_url'],
                    # The 'Unknown' default applies only when the key is absent;
                    # a key that is present with a null value is kept as None.
                    'Programming_Language': item.get('language', 'Unknown'),
                    'Creation_Date': item['created_at'],
                    'Last_Updated_Date': item['updated_at'],
                    'Number_of_Stars': item['stargazers_count'],
                    'Number_of_Forks': item['forks_count'],
                    'Number_of_Open_Issues': item['open_issues_count'],
                    'License_Type': item['license']['name'] if item['license'] else 'No License'
                })

    return data_list


In [3]:
# Search terms to query on GitHub; each one is fetched independently
trending_topics = [
    'machine learning',
    'data visualization',
    'deep learning',
    'natural language processing',
    'data science',
    'artificial intelligence',
    'big data',
    'computer vision',
    'data mining',
    'neural networks',
]

# Fetch the repository records for every topic in one call
data_list = fetch_trending_repositories(trending_topics)

# Load the records into a DataFrame and display it
df = pd.DataFrame(data_list)
df

Unnamed: 0,topics,Repository_Name,Owner,Description,URL,Programming_Language,Creation_Date,Last_Updated_Date,Number_of_Stars,Number_of_Forks,Number_of_Open_Issues,License_Type
0,machine learning,tensorflow,tensorflow,An Open Source Machine Learning Framework for ...,https://github.com/tensorflow/tensorflow,C++,2015-11-07T01:19:20Z,2024-10-08T11:18:23Z,185875,74234,5109,Apache License 2.0
1,machine learning,transformers,huggingface,🤗 Transformers: State-of-the-art Machine Learn...,https://github.com/huggingface/transformers,Python,2018-10-29T13:56:00Z,2024-10-08T11:20:21Z,133061,26554,1452,Apache License 2.0
2,machine learning,ML-For-Beginners,microsoft,"12 weeks, 26 lessons, 52 quizzes, classic Mach...",https://github.com/microsoft/ML-For-Beginners,HTML,2021-03-03T01:34:05Z,2024-10-08T10:28:21Z,69346,14349,9,MIT License
3,machine learning,funNLP,fighting41love,中英文敏感词、语言检测、中外手机/电话归属地/运营商查询、名字推断性别、手机号抽取、身份证抽...,https://github.com/fighting41love/funNLP,Python,2018-08-21T11:20:39Z,2024-10-08T11:07:47Z,68112,14442,28,No License
4,machine learning,awesome-machine-learning,josephmisiti,A curated list of awesome Machine Learning fra...,https://github.com/josephmisiti/awesome-machin...,Python,2014-07-15T19:11:19Z,2024-10-08T10:16:32Z,65624,14597,7,Other
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,neural networks,nea,nusnlp,Neural Essay Assessor: An Automated Essay Scor...,https://github.com/nusnlp/nea,Python,2016-09-26T05:07:42Z,2024-09-28T13:30:44Z,205,71,9,GNU General Public License v3.0
9996,neural networks,stylenet,machrisaa,Neural Network with Style Synthesis,https://github.com/machrisaa/stylenet,Python,2016-03-27T17:52:26Z,2024-07-19T04:06:41Z,205,37,3,No License
9997,neural networks,Conv-Nets-Series,colah,A series of blog posts on convolutional neural...,https://github.com/colah/Conv-Nets-Series,,2014-06-21T15:54:15Z,2024-03-10T12:04:59Z,205,65,3,No License
9998,neural networks,spinn,stanfordnlp,SPINN (Stack-augmented Parser-Interpreter Neur...,https://github.com/stanfordnlp/spinn,Python,2015-10-14T21:52:47Z,2024-08-25T00:23:01Z,205,86,1,MIT License


In [4]:
# Shape of the freshly fetched DataFrame as a (rows, columns) tuple
df.shape

(10000, 12)

## Data cleaning

In [5]:
# Summary of the raw DataFrame: per-column non-null counts and dtypes,
# which shows which columns contain missing values before cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   topics                 10000 non-null  object
 1   Repository_Name        10000 non-null  object
 2   Owner                  10000 non-null  object
 3   Description            9733 non-null   object
 4   URL                    10000 non-null  object
 5   Programming_Language   8735 non-null   object
 6   Creation_Date          10000 non-null  object
 7   Last_Updated_Date      10000 non-null  object
 8   Number_of_Stars        10000 non-null  int64 
 9   Number_of_Forks        10000 non-null  int64 
 10  Number_of_Open_Issues  10000 non-null  int64 
 11  License_Type           10000 non-null  object
dtypes: int64(3), object(9)
memory usage: 937.6+ KB


In [6]:
# Drop every row that has no Programming_Language value.
# NOTE: inplace=True mutates `df` itself, so the unfiltered frame is no longer
# available after this cell and the index keeps gaps where rows were removed.
df.dropna(subset=["Programming_Language"], inplace=True)


In [7]:
# Replace missing repository descriptions with a readable placeholder
description_filled = df["Description"].fillna("No Description")
df["Description"] = description_filled

In [8]:
# Parse both timestamp columns from strings into pandas datetimes
for date_col in ('Creation_Date', 'Last_Updated_Date'):
    df[date_col] = pd.to_datetime(df[date_col])

# Preview the first rows to confirm the conversion
df.head()

Unnamed: 0,topics,Repository_Name,Owner,Description,URL,Programming_Language,Creation_Date,Last_Updated_Date,Number_of_Stars,Number_of_Forks,Number_of_Open_Issues,License_Type
0,machine learning,tensorflow,tensorflow,An Open Source Machine Learning Framework for ...,https://github.com/tensorflow/tensorflow,C++,2015-11-07 01:19:20+00:00,2024-10-08 11:18:23+00:00,185875,74234,5109,Apache License 2.0
1,machine learning,transformers,huggingface,🤗 Transformers: State-of-the-art Machine Learn...,https://github.com/huggingface/transformers,Python,2018-10-29 13:56:00+00:00,2024-10-08 11:20:21+00:00,133061,26554,1452,Apache License 2.0
2,machine learning,ML-For-Beginners,microsoft,"12 weeks, 26 lessons, 52 quizzes, classic Mach...",https://github.com/microsoft/ML-For-Beginners,HTML,2021-03-03 01:34:05+00:00,2024-10-08 10:28:21+00:00,69346,14349,9,MIT License
3,machine learning,funNLP,fighting41love,中英文敏感词、语言检测、中外手机/电话归属地/运营商查询、名字推断性别、手机号抽取、身份证抽...,https://github.com/fighting41love/funNLP,Python,2018-08-21 11:20:39+00:00,2024-10-08 11:07:47+00:00,68112,14442,28,No License
4,machine learning,awesome-machine-learning,josephmisiti,A curated list of awesome Machine Learning fra...,https://github.com/josephmisiti/awesome-machin...,Python,2014-07-15 19:11:19+00:00,2024-10-08 10:16:32+00:00,65624,14597,7,Other


In [9]:
# List the column labels of the DataFrame
df.columns

Index(['topics', 'Repository_Name', 'Owner', 'Description', 'URL',
       'Programming_Language', 'Creation_Date', 'Last_Updated_Date',
       'Number_of_Stars', 'Number_of_Forks', 'Number_of_Open_Issues',
       'License_Type'],
      dtype='object')

In [10]:
# Distinct values found in the Programming_Language column
df["Programming_Language"].unique()

array(['C++', 'Python', 'HTML', 'Jupyter Notebook', 'JavaScript', 'C',
       'C#', 'TypeScript', 'Java', 'Scala', 'Go', 'CSS', 'TeX', 'Swift',
       'MATLAB', 'Markdown', 'PowerShell', 'Rust', 'R', 'Julia', 'Ruby',
       'PHP', 'Haskell', 'Kotlin', 'Clojure', 'Matlab', 'Terra', 'Cuda',
       'Makefile', 'TSQL', 'Common Lisp', 'Shell', 'Objective-C',
       'Elixir', 'OpenEdge ABL', 'Perl', 'Vue', 'Mojo', 'Lean', 'Dart',
       'HCL', 'Elm', 'Svelte', 'ActionScript', 'Processing', 'F#',
       'Emacs Lisp', 'Starlark', 'Dockerfile', 'Pascal', 'Stata',
       'PLpgSQL', 'LiveScript', 'GDScript', 'Mathematica', 'VBA',
       'CoffeeScript', 'PureBasic', 'MDX', 'Nim', 'Lua', 'Fortran',
       'Crystal', 'SCSS', 'Smalltalk', 'Roff', 'Cython', 'Haxe', 'CLIPS',
       'OCaml', 'Lex', 'q', 'Apex', 'Smarty', 'Cypher', 'QML', 'Cool',
       'ReScript', 'Racket', 'Bicep', 'Nix', 'nesC', 'Scheme', 'V',
       'Objective-J', 'CMake', 'WebAssembly', 'Idris', 'HLSL',
       'Rich Text Format', 'N

In [11]:
# Summary of the DataFrame after cleaning: non-null counts and dtypes per column,
# to confirm that no missing values remain and the date columns were converted
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8735 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   topics                 8735 non-null   object             
 1   Repository_Name        8735 non-null   object             
 2   Owner                  8735 non-null   object             
 3   Description            8735 non-null   object             
 4   URL                    8735 non-null   object             
 5   Programming_Language   8735 non-null   object             
 6   Creation_Date          8735 non-null   datetime64[ns, UTC]
 7   Last_Updated_Date      8735 non-null   datetime64[ns, UTC]
 8   Number_of_Stars        8735 non-null   int64              
 9   Number_of_Forks        8735 non-null   int64              
 10  Number_of_Open_Issues  8735 non-null   int64              
 11  License_Type           8735 non-null   object             
dt

In [12]:
# Shape of the DataFrame after cleaning, as a (rows, columns) tuple
df.shape

(8735, 12)

In [13]:
# Display the cleaned DataFrame; the index has not been reset yet, so it still
# carries the original row labels with gaps where rows were dropped
df

Unnamed: 0,topics,Repository_Name,Owner,Description,URL,Programming_Language,Creation_Date,Last_Updated_Date,Number_of_Stars,Number_of_Forks,Number_of_Open_Issues,License_Type
0,machine learning,tensorflow,tensorflow,An Open Source Machine Learning Framework for ...,https://github.com/tensorflow/tensorflow,C++,2015-11-07 01:19:20+00:00,2024-10-08 11:18:23+00:00,185875,74234,5109,Apache License 2.0
1,machine learning,transformers,huggingface,🤗 Transformers: State-of-the-art Machine Learn...,https://github.com/huggingface/transformers,Python,2018-10-29 13:56:00+00:00,2024-10-08 11:20:21+00:00,133061,26554,1452,Apache License 2.0
2,machine learning,ML-For-Beginners,microsoft,"12 weeks, 26 lessons, 52 quizzes, classic Mach...",https://github.com/microsoft/ML-For-Beginners,HTML,2021-03-03 01:34:05+00:00,2024-10-08 10:28:21+00:00,69346,14349,9,MIT License
3,machine learning,funNLP,fighting41love,中英文敏感词、语言检测、中外手机/电话归属地/运营商查询、名字推断性别、手机号抽取、身份证抽...,https://github.com/fighting41love/funNLP,Python,2018-08-21 11:20:39+00:00,2024-10-08 11:07:47+00:00,68112,14442,28,No License
4,machine learning,awesome-machine-learning,josephmisiti,A curated list of awesome Machine Learning fra...,https://github.com/josephmisiti/awesome-machin...,Python,2014-07-15 19:11:19+00:00,2024-10-08 10:16:32+00:00,65624,14597,7,Other
...,...,...,...,...,...,...,...,...,...,...,...,...
9994,neural networks,Pytorch-Quaternion-Neural-Networks,Orkis-Research,This repository is an update to all previous r...,https://github.com/Orkis-Research/Pytorch-Quat...,Python,2018-09-28 09:03:15+00:00,2024-10-04 15:52:37+00:00,205,54,5,GNU General Public License v3.0
9995,neural networks,nea,nusnlp,Neural Essay Assessor: An Automated Essay Scor...,https://github.com/nusnlp/nea,Python,2016-09-26 05:07:42+00:00,2024-09-28 13:30:44+00:00,205,71,9,GNU General Public License v3.0
9996,neural networks,stylenet,machrisaa,Neural Network with Style Synthesis,https://github.com/machrisaa/stylenet,Python,2016-03-27 17:52:26+00:00,2024-07-19 04:06:41+00:00,205,37,3,No License
9998,neural networks,spinn,stanfordnlp,SPINN (Stack-augmented Parser-Interpreter Neur...,https://github.com/stanfordnlp/spinn,Python,2015-10-14 21:52:47+00:00,2024-08-25 00:23:01+00:00,205,86,1,MIT License


In [14]:
# Renumber the rows 0..n-1; drop=True discards the old index instead of
# keeping it as an extra column
df= df.reset_index(drop=True)

In [15]:
# Display the DataFrame again to check the renumbered index
df

Unnamed: 0,topics,Repository_Name,Owner,Description,URL,Programming_Language,Creation_Date,Last_Updated_Date,Number_of_Stars,Number_of_Forks,Number_of_Open_Issues,License_Type
0,machine learning,tensorflow,tensorflow,An Open Source Machine Learning Framework for ...,https://github.com/tensorflow/tensorflow,C++,2015-11-07 01:19:20+00:00,2024-10-08 11:18:23+00:00,185875,74234,5109,Apache License 2.0
1,machine learning,transformers,huggingface,🤗 Transformers: State-of-the-art Machine Learn...,https://github.com/huggingface/transformers,Python,2018-10-29 13:56:00+00:00,2024-10-08 11:20:21+00:00,133061,26554,1452,Apache License 2.0
2,machine learning,ML-For-Beginners,microsoft,"12 weeks, 26 lessons, 52 quizzes, classic Mach...",https://github.com/microsoft/ML-For-Beginners,HTML,2021-03-03 01:34:05+00:00,2024-10-08 10:28:21+00:00,69346,14349,9,MIT License
3,machine learning,funNLP,fighting41love,中英文敏感词、语言检测、中外手机/电话归属地/运营商查询、名字推断性别、手机号抽取、身份证抽...,https://github.com/fighting41love/funNLP,Python,2018-08-21 11:20:39+00:00,2024-10-08 11:07:47+00:00,68112,14442,28,No License
4,machine learning,awesome-machine-learning,josephmisiti,A curated list of awesome Machine Learning fra...,https://github.com/josephmisiti/awesome-machin...,Python,2014-07-15 19:11:19+00:00,2024-10-08 10:16:32+00:00,65624,14597,7,Other
...,...,...,...,...,...,...,...,...,...,...,...,...
8730,neural networks,Pytorch-Quaternion-Neural-Networks,Orkis-Research,This repository is an update to all previous r...,https://github.com/Orkis-Research/Pytorch-Quat...,Python,2018-09-28 09:03:15+00:00,2024-10-04 15:52:37+00:00,205,54,5,GNU General Public License v3.0
8731,neural networks,nea,nusnlp,Neural Essay Assessor: An Automated Essay Scor...,https://github.com/nusnlp/nea,Python,2016-09-26 05:07:42+00:00,2024-09-28 13:30:44+00:00,205,71,9,GNU General Public License v3.0
8732,neural networks,stylenet,machrisaa,Neural Network with Style Synthesis,https://github.com/machrisaa/stylenet,Python,2016-03-27 17:52:26+00:00,2024-07-19 04:06:41+00:00,205,37,3,No License
8733,neural networks,spinn,stanfordnlp,SPINN (Stack-augmented Parser-Interpreter Neur...,https://github.com/stanfordnlp/spinn,Python,2015-10-14 21:52:47+00:00,2024-08-25 00:23:01+00:00,205,86,1,MIT License


In [16]:
# Write the cleaned DataFrame to a CSV file; index=False leaves the row index out.
# NOTE(review): this is an absolute path on one specific Windows machine, so the
# cell fails anywhere that directory does not exist — consider a path relative
# to a configurable data directory.
path=r"C:\Users\Kavitha\Desktop\CAPSTONE PROJECT\Github api\final.csv"
df.to_csv(path,index=False)

In [18]:
import os
from getpass import getpass

import mysql.connector

# Connection settings come from the environment so that no credentials are
# stored in the notebook; host and user fall back to the local defaults.
db_host = os.environ.get("MYSQL_HOST", "localhost")
db_user = os.environ.get("MYSQL_USER", "root")
db_password = os.environ.get("MYSQL_PASSWORD")
if db_password is None:
    # Ask interactively rather than hardcoding the password.
    db_password = getpass("MySQL password: ")

# Connect to the MySQL server WITHOUT selecting a database: selecting
# "Githubapi" at connect time fails when it does not exist yet, which would
# make the CREATE DATABASE statement below useless on a fresh server.
conn = mysql.connector.connect(
    host=db_host,
    user=db_user,
    password=db_password,
)

if conn.is_connected():
    print("Connected to MySQL database")

# Create a cursor object
my_cursor = conn.cursor()

# Create the database if it doesn't exist, then make it the active one
my_cursor.execute("CREATE DATABASE IF NOT EXISTS Githubapi")
my_cursor.execute("USE Githubapi")

# Create table - Git (ID is generated by MySQL; the other columns mirror the DataFrame)
my_cursor.execute('''
    CREATE TABLE IF NOT EXISTS Git (
        ID INT AUTO_INCREMENT PRIMARY KEY,
        topics VARCHAR(225),
        Repository_Name VARCHAR(225), 
        Owner VARCHAR(225),
        Description LONGTEXT,
        URL TEXT, 
        Programming_Language VARCHAR(225),
        Creation_Date DATETIME,
        Last_Updated_Date DATETIME,
        Number_of_Stars INT,
        Number_of_Forks INT,
        Number_of_Open_Issues INT,
        License_Type VARCHAR(225)
    )
''')

print("Table created successfully")

# Commit the transaction
conn.commit()


Connected to MySQL database
Table created successfully


In [19]:
# Insert DataFrame into Git table (without ID column as it's AUTO_INCREMENT)
table_insert_query = '''INSERT INTO Git (topics, Repository_Name, Owner, Description, URL,
                                         Programming_Language, Creation_Date, Last_Updated_Date,
                                         Number_of_Stars, Number_of_Forks, Number_of_Open_Issues, License_Type)
                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'''

# Select the columns explicitly, in the same order as the INSERT statement,
# so the insert does not depend on the DataFrame's current column order.
insert_columns = [
    'topics', 'Repository_Name', 'Owner', 'Description', 'URL',
    'Programming_Language', 'Creation_Date', 'Last_Updated_Date',
    'Number_of_Stars', 'Number_of_Forks', 'Number_of_Open_Issues', 'License_Type',
]


def _to_mysql_value(value):
    """Convert one DataFrame cell into a plain Python value for the connector."""
    if isinstance(value, pd.Timestamp):
        # DATETIME columns store no timezone: express the instant in UTC,
        # drop the tz info and hand over a standard datetime object.
        if value.tzinfo is not None:
            value = value.tz_convert(None)
        return value.to_pydatetime()
    if pd.isna(value):
        return None  # missing values (None / NaN / NaT) become SQL NULL
    if hasattr(value, 'item'):
        return value.item()  # NumPy scalar -> native Python scalar
    return value


# Convert DataFrame to list of tuples holding only native Python values
data = [
    tuple(_to_mysql_value(cell) for cell in row)
    for row in df[insert_columns].itertuples(index=False, name=None)
]

# Execute the insert query; undo the partial batch if anything fails.
# NOTE: re-running this cell inserts the same rows again (the table has no
# unique key besides the generated ID).
try:
    my_cursor.executemany(table_insert_query, data)
    conn.commit()
except mysql.connector.Error:
    conn.rollback()
    raise

print("Values inserted successfully")


Values inserted successfully
