In [62]:
import os
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
import seaborn as sns

In [107]:
# Location of the raw CSV. Set the CHAMPIONS_DATA_PATH environment variable to
# run the notebook on another machine; when it is unset the original local
# path is used, so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    'CHAMPIONS_DATA_PATH',
    r'C:\Users\ADMIN\sdsdatathon\data\champions_group_data.csv',
)

# low_memory=False makes pandas read the file in one pass rather than in
# chunks, so each column's dtype is inferred from all of its values.
df = pd.read_csv(DATA_PATH, low_memory=False)

In [108]:
# Back-fill missing state/province abbreviations.
#
# For each (full name column, abbreviation column) pair below:
#   1. strip surrounding whitespace from both columns,
#   2. learn a name -> abbreviation mapping from the rows where both are present,
#   3. fill missing abbreviations by looking the row's name up in that mapping.
# Each pair is handled independently: a mapping learned from one pair is never
# used to fill another pair's abbreviation column.


def _build_abbreviation_map(frame, name_col, abbr_col):
    """Return a ``{name: abbreviation}`` dict learned from ``frame``.

    Only rows where both ``name_col`` and ``abbr_col`` are non-missing are
    considered. When a name appears more than once, the abbreviation from its
    first occurrence (in row order) is the one that is kept.
    """
    return (
        frame.dropna(subset=[name_col, abbr_col])
             .drop_duplicates(name_col)
             .set_index(name_col)[abbr_col]
             .to_dict()
    )


# (full name column, abbreviation column) pairs, processed in this order.
STATE_ABBREVIATION_PAIRS = [
    ('State', 'State Or Province Abbreviation'),
    ('Parent State/Province', 'Parent State/Province Abbreviation'),
    ('Global Ultimate State/Province', 'Ultimate State/Province Abbreviation'),
    ('Domestic Ultimate State/Province Name', 'Domestic Ultimate State Abbreviation'),
]

for name_col, abbr_col in STATE_ABBREVIATION_PAIRS:
    # Normalise whitespace first so that 'Texas ' and 'Texas' share one key.
    df[name_col] = df[name_col].str.strip()
    df[abbr_col] = df[abbr_col].str.strip()

    # `state_map` is rebound on every iteration; after the loop it holds the
    # mapping for the last pair.
    state_map = _build_abbreviation_map(df, name_col, abbr_col)

    # Names absent from the mapping map to NaN, so those rows stay missing.
    df[abbr_col] = df[abbr_col].fillna(df[name_col].map(state_map))

In [109]:
def parse_range_to_mean(value):
    """Collapse a count or a textual count range into a single number.

    Every number found in the text is extracted and their arithmetic mean is
    returned, so ``"1 to 10"`` becomes ``5.5`` and ``"11-50"`` becomes ``30.5``.

    Parameters
    ----------
    value : object
        A cell value: a string such as ``"1 to 10"``, an already-numeric
        value, or a missing value.

    Returns
    -------
    float
        * ``np.nan`` when ``value`` is missing or contains no digits.
        * ``float(value)`` when ``value`` is already an int/float. It is
          returned unchanged so that applying this function twice to the same
          column does not alter it.
        * Otherwise the mean of the numbers found in ``str(value)``.

    Notes
    -----
    Thousands separators (``"1,000"``) and decimal fractions (``"2.5"``) are
    read as one number each. A hyphen is always treated as a range separator,
    never as a minus sign.
    """
    if pd.isna(value):
        return np.nan

    # Already numeric: return as-is. Going through str() would split 10.0 into
    # the tokens "10" and "0" and average them to 5.0. bool is excluded
    # explicitly because it is a subclass of int.
    if isinstance(value, (int, float, np.integer, np.floating)) and not isinstance(value, (bool, np.bool_)):
        return float(value)

    # First alternative: comma-grouped thousands such as 1,000 or 12,345,678
    # (the look-ahead rejects "500,1000", which is two separate numbers).
    # Second alternative: a plain run of digits. Both accept an optional
    # decimal fraction.
    tokens = re.findall(r'\d{1,3}(?:,\d{3})+(?!\d)(?:\.\d+)?|\d+(?:\.\d+)?', str(value))
    if not tokens:
        return np.nan

    numbers = [float(token.replace(',', '')) for token in tokens]
    return sum(numbers) / len(numbers)

# Hardware-count columns whose values are converted to a single number per row.
hardware_cols = [
    'No. of PC',
    'No. of Desktops',
    'No. of Laptops',
    'No. of Routers',
    'No. of Servers',
    'No. of Storage Devices',
]

for col in hardware_cols:
    # Columns missing from this dataset are skipped rather than raising.
    if col not in df.columns:
        continue
    # The parsed values overwrite the original column in place.
    df[col] = df[col].apply(parse_range_to_mean)

In [114]:
# Normalise every object-dtype column: missing values become 'Unknown', and
# text is stripped of surrounding whitespace and title-cased.
object_columns = df.select_dtypes(include=['object']).columns

# Fill NaN values in object-type columns with 'Unknown'
df[object_columns] = df[object_columns].fillna('Unknown')


def _strip_and_title(value):
    """Strip and title-case ``value`` if it is a string; otherwise return it unchanged."""
    return value.strip().title() if isinstance(value, str) else value


def _upper(value):
    """Upper-case ``value`` if it is a string; otherwise return it unchanged."""
    return value.upper() if isinstance(value, str) else value


# An object column can hold non-string values (for example Python bools when a
# True/False column has missing entries). The `.str` accessor returns NaN for
# such elements, which would silently erase them, so only real strings are
# transformed here and everything else passes through untouched.
for col in object_columns:
    df[col] = df[col].map(_strip_and_title)

abbreviation_cols = ['State Or Province Abbreviation','Parent State/Province Abbreviation',
                     'Ultimate State/Province Abbreviation','Domestic Ultimate State Abbreviation']

# Abbreviations are stored upper-case, undoing the title-casing applied above.
# NOTE(review): this also turns the 'Unknown' placeholder into 'UNKNOWN' in
# these four columns only - confirm that downstream code expects that.
for col in abbreviation_cols:
    df[col] = df[col].map(_upper)

In [115]:
# Columns whose missing values are replaced with the text 'Unknown'.
# The names are reproduced exactly as written, including the trailing space in
# 'DUNS Number ' and the spelling 'Lattitude' - presumably these match the CSV
# header; verify before "correcting" them.
cols = [
    'DUNS Number ', 'Postal Code', 'Phone Number',
    'SIC Code', '8-Digit SIC Code', 'NAICS Code', 'NACE Rev 2 Code', 'Ticker',
    'Year Found', 'Lattitude', 'Longitude',
    'Parent Postal Code', 'Global Ultimate Postal Code', 'Domestic Ultimate Postal Code',
    'Registration Number', 'Is Headquarters', 'Is Domestic Ultimate',
    'ANZSIC Code', 'ISIC Rev 4 Code',
]

# Every listed column must exist in df; a missing one raises KeyError.
df[cols] = df[cols].fillna(value='Unknown')

In [116]:
# Select the numeric columns. Despite its name, `integer_columns` covers both
# integer and float dtypes.
numeric_frame = df.select_dtypes(include=['int','float'])
integer_columns = numeric_frame.columns

# Replace any remaining NaN in those columns with 0.
df[integer_columns] = numeric_frame.fillna(value=0)

In [None]:
# df.to_csv('cleaned_champions_data.csv', index=False)