Run all cells in order, from top to bottom: each cell relies on variables defined by the cells before it.

In [38]:
%pip install pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.13 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
"""Dependencies import."""

import pandas as pd # CSV loading, cleaning and merging
import sqlite3      # Embedded SQL database (no external SQL server or workbench needed)
import os           # File-system interaction

print("Dependencies imported successfully.")

Dependencies imported successfully.


In [None]:
"""Files loading, Pandas DF parsing and inspection."""

# File paths
sp500_file = 'Database_ressources/sp_500_companies_with_financial_information.csv'
marketcap_file = 'Database_ressources/top_global_companies_by_market_cap.csv'
db_file = 'companies_database.db'
table_name = 'companies'

# Load both CSV files into DataFrames.
# Every later cell needs df_sp500 and df_marketcap, so a failure here is fatal:
# print the friendly hint, then re-raise so the notebook stops on THIS cell
# instead of failing further down with an unrelated NameError.
try:
    df_sp500 = pd.read_csv(sp500_file)
    df_marketcap = pd.read_csv(marketcap_file)
except FileNotFoundError as e:
    print(f"Error loading files: {e}")
    print("Please ensure the file paths are correct and the zip extraction worked.")
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    raise
else:
    print("CSV files loaded successfully.")

# Optional inspection (uncomment while exploring the raw data):
# print("\nS&P 500 info:")
# df_sp500.info()
# print(df_sp500.head())
# print("\nMarket Cap info:")
# df_marketcap.info()
# print(df_marketcap.head())

CSV files loaded successfully.


In [27]:
"""DF columns renaming."""

# S&P 500 DF columns renaming
df1 = (
    df_sp500[['Symbol', 'Security', 'GICS Sector', 'GICS Sub-Industry', 'Founded']]
    .rename(columns={'GICS Sector': 'Sector', 'GICS Sub-Industry': 'Industry'})
    # Reduce 'Founded' to the first run of four digits found in its text form
    .assign(Founded=lambda frame: frame['Founded'].astype(str).str.extract(r'(\d{4})', expand=False))
)
# Optional inspection:
# print("Selected and renamed S&P 500 columns:")
# print(df1.head())

print("S&P 500 DF columns renamed successfully.")

# Top Global Market Cap: keep the four needed columns and align their names
df2 = df_marketcap[['Company Code', 'Marketcap', 'Stock Price', 'Country']].rename(
    columns={
        'Company Code': 'Symbol',  # merge key: has to match the 'Symbol' column of df1
        'Stock Price': 'Stockprice',
    }
)
# Optional inspection:
# print("Selected and renamed Top Global Market Cap columns:")
# print(df2.head())

print("Top Global Market Cap DF columns renamed successfully.")

S&P 500 DF columns renamed successfully.
Top Global Market Cap DF columns renamed successfully.


In [30]:
"""Top Global Market Cap DF datas cleaning."""

# Top Global Market Cap DF cleaning (handle $, T, B, M, commas)
def clean_marketcap(value):
    """Convert a raw market-cap cell such as '$3.033 T' into a plain number.

    Args:
        value: Raw cell content. Numbers are returned unchanged. Strings may
            contain '$', thousands separators (','), surrounding whitespace and
            a trailing magnitude suffix: T (1e12), B (1e9) or M (1e6), in
            either case.

    Returns:
        The numeric value (float for parsed strings, the original object for
        int/float input), or None when the cell is neither a number nor a
        string, or when the string cannot be parsed.
    """
    if isinstance(value, (int, float)):
        return value
    if not isinstance(value, str):
        return None

    multipliers = {'T': 1e12, 'B': 1e9, 'M': 1e6}
    text = value.replace('$', '').replace(',', '').strip()

    # Only a TRAILING letter is treated as a magnitude suffix, so unrelated text
    # that merely contains T/B/M (e.g. "TBD") is not mistaken for a magnitude.
    factor = 1.0
    if text and text[-1].upper() in multipliers:
        factor = multipliers[text[-1].upper()]
        text = text[:-1].strip()  # tolerate a space before the suffix: '3.033 T'

    # A single try covers every path, so one malformed cell yields None
    # instead of raising and aborting the whole column conversion.
    try:
        return float(text) * factor
    except ValueError:
        return None

# Run every raw market-cap cell through clean_marketcap, element by element
df2['Marketcap'] = df2['Marketcap'].map(clean_marketcap)

# Stockprice cleaning (handle $, commas)
def clean_stockprice(value):
    """Turn a raw stock-price cell such as '$1,234.50' into a number.

    Args:
        value: Raw cell content. int/float input is handed back untouched;
            strings may carry '$', ',' and surrounding whitespace.

    Returns:
        The parsed float (or the original number), or None when the cell is
        neither a number nor a string, or when the string is not numeric.
    """
    if isinstance(value, (int, float)):
        return value
    if isinstance(value, str):
        # Drop every '$' and ',' in one pass, then trim whitespace
        digits = value.translate(str.maketrans('', '', '$,')).strip()
        try:
            return float(digits)
        except ValueError:
            # Not a number even after cleaning
            return None
    return None

# Run every raw price cell through clean_stockprice, element by element
df2['Stockprice'] = df2['Stockprice'].map(clean_stockprice)

# Optional inspection of the cleaned frame:
# print("\nCleaned Market Cap DF (showing Symbol, Marketcap, Stockprice, Country):")
# print(df2.head())
# print("\nData types after cleaning:")
# df2.info()

print("Top Global Market Cap DF datas cleaned successfully.")


Top Global Market Cap DF datas cleaned successfully.


In [33]:
"""DataFrames merging."""

# Inner join on 'Symbol' (only symbols present in BOTH frames survive), then
# order by Marketcap, largest first, and keep the first row of each symbol so
# that a symbol occurring several times ends up with a single row.
merged_df = (
    df1.merge(df2, on='Symbol', how='inner')
    .sort_values('Marketcap', ascending=False)
    .drop_duplicates(subset='Symbol', keep='first')
)

# Debugging aids (uncomment if the merge result looks wrong):
# print("Duplicates in df1 Symbol:", df1.duplicated('Symbol').sum())
# print("Duplicates in df2 Symbol:", df2.duplicated('Symbol').sum())
# print(f"Merged DataFrame contains {len(merged_df)} rows.")
# print("Merged DataFrame head:")
# print(merged_df.head())
# merged_df.info()

print("DF merged successfully.")

DF merged successfully.


In [35]:
"""DF preparation for SQL."""

# Target schema: column names in the exact order the SQL table should have
final_columns = ['Symbol', 'Security', 'Sector', 'Industry', 'Founded', 'Marketcap', 'Stockprice', 'Country']

# Report any schema column that the merged frame does not provide
missing_cols = list(filter(lambda name: name not in merged_df.columns, final_columns))
if len(missing_cols) > 0:
    print(f"Warning: The following columns are missing from the merged DataFrame: {missing_cols}")
    # Possible handling, if ever needed: add the absent columns filled with None
    # for col in missing_cols:
    #     merged_df[col] = None

# Pick the schema columns in schema order; .copy() gives an independent frame
# so later modifications cannot trigger SettingWithCopyWarning
final_df = merged_df.loc[:, final_columns].copy()

# Optional inspection:
# print("Final DataFrame structure for SQL:")
# print(final_df.head())
# final_df.info()

print("DF preparation for SQL successful.")

DF preparation for SQL successful.


In [37]:
"""SQLite database creation."""

# Connection to SQLite Database (creates the file if it doesn't exist)
conn = sqlite3.connect(db_file)
print(f"Connected to SQLite database: {db_file}")

# Use pandas to_sql to create the table and insert the data.
# if_exists='replace': drops the table if it exists, then recreates it and inserts. Good for reruns.
# if_exists='append':  adds the rows to the existing table.
# if_exists='fail':    raises an error if the table already exists.
try:
    final_df.to_sql(table_name, conn, if_exists='replace', index=False)
    # Commit only once the import has succeeded
    conn.commit()
    print(f"Data successfully imported into table '{table_name}' in database '{db_file}'")

    # Verify by reading back some data
    print("\nVerifying import - First 5 rows from SQL database:")
    verify_df = pd.read_sql(f"SELECT * FROM {table_name} LIMIT 5", conn)
    print(verify_df)

except sqlite3.Error as e:
    # Discard any uncommitted work instead of persisting a half-finished import
    conn.rollback()
    print(f"SQLite error during import: {e}")
except Exception as e:
    conn.rollback()
    print(f"An unexpected error occurred during SQL import: {e}")
finally:
    # Always release the database file, whether the import succeeded or not
    conn.close()
    print("Database connection closed.")

Connected to SQLite database: companies_database.db
Data successfully imported into table 'companies' in database 'companies_database.db'

Verifying import - First 5 rows from SQL database:
  Symbol                 Security                  Sector  \
0   MSFT                Microsoft  Information Technology   
1   AAPL               Apple Inc.  Information Technology   
2   GOOG  Alphabet Inc. (Class C)  Communication Services   
3   AMZN                   Amazon  Consumer Discretionary   
4   NVDA                   Nvidia  Information Technology   

                                     Industry Founded     Marketcap  \
0                            Systems Software    1975  3.033000e+12   
1  Technology Hardware, Storage & Peripherals    1977  2.951000e+12   
2                Interactive Media & Services    1998  1.909000e+12   
3                            Broadline Retail    1994  1.653000e+12   
4                              Semiconductors    1993  1.522000e+12   

   Stockprice Co