# Data Cleaning and Insertion into MSSQL

Importer Biblioteker og Opret Forbindelse til MSSQL

In [24]:
import pandas as pd
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, Float
from sqlalchemy import inspect

# Load the CSV data into a DataFrame
file_path = '2016_-_Cities_Emissions_Reduction_Targets_20240207.csv'
df = pd.read_csv(file_path)

# Display the initial data structure
print("Initial Data:")
print(df.head())
print(df.info())

# Handling Missing Values
df['Baseline emissions (metric tonnes CO2e)'] = df['Baseline emissions (metric tonnes CO2e)'].fillna(df['Baseline emissions (metric tonnes CO2e)'].median())
df['Percentage reduction target'] = df['Percentage reduction target'].fillna(df['Percentage reduction target'].median())

df['City Short Name'] = df['City Short Name'].fillna('Unknown City')
df['Country'] = df['Country'].fillna('Unknown Country')
df['Organisation'] = df['Organisation'].fillna('Unknown Organisation')

df['Reporting Year'] = pd.to_datetime(df['Reporting Year'], format='%Y', errors='coerce').dt.year
df['Target date'] = pd.to_datetime(df['Target date'], format='%Y', errors='coerce').dt.year

df['Reporting Year'] = df['Reporting Year'].fillna(0)
df['Target date'] = df['Target date'].fillna(0)

df['City Short Name'] = df['City Short Name'].str.title()
df.drop_duplicates(inplace=True)
df['Organisation'] = df['Organisation'].str.strip()
df['Country'] = df['Country'].replace({'Usa': 'USA', 'Uk': 'United Kingdom'})
df = df[df['Percentage reduction target'] >= 0]

# Step 10: Exporting Clean Data
cleaned_file_path = 'cleaned_data_MSSQL_2016_-_Cities_Emissions_Reduction_Targets_20240207.csv'
df.to_csv(cleaned_file_path, index=False)

# Define the connection parameters
server = 'JEFFREY'
database = 'EksamensProjekt_DB'
username = 'Oechsner'
password = '1234'
connection_string = f'mssql+pyodbc://{username}:{password}@{server}/{database}?driver=ODBC+Driver+17+for+SQL+Server'
engine = create_engine(connection_string)

# Define the metadata and table schema
metadata = MetaData()

# Define the table schema without setting Account No as the primary key
table = Table(
    'Cities_Emissions_Reduction_Targets_2016', metadata,
    Column('id', Integer, primary_key=True, autoincrement=True),
    Column('Account No', Integer),
    Column('Organisation', String),
    Column('Country', String),
    Column('City Short Name', String),
    Column('C40', String),
    Column('Reporting Year', Integer),
    Column('Sector', String),
    Column('Target boundary', String),
    Column('Baseline year', String),
    Column('Baseline emissions (metric tonnes CO2e)', Float),
    Column('Percentage reduction target', Float),
    Column('Target date', Integer),
    Column('Comment', String),
    Column('City Location', String),
    Column('Country Location', String)
)

# Drop the table if it already exists to ensure we start fresh
inspector = inspect(engine)
if 'Cities_Emissions_Reduction_Targets_2016' in inspector.get_table_names():
    table.drop(engine)

# Create the table in the database
metadata.create_all(engine)

# Insert data into the table without 'id' column
df.to_sql('Cities_Emissions_Reduction_Targets_2016', engine, if_exists='append', index=False)

print("Data successfully inserted into the Cities_Emissions_Reduction_Targets_2016 table.")


Initial Data:
             Organisation  Account No  Country      City Short Name  C40  \
0           Odder Kommune       58796  Denmark        Odder Kommune  NaN   
1        Comune di Napoli       36158    Italy               Napoli  NaN   
2     Egedal Municipality       62855  Denmark  Egedal Municipality  NaN   
3            Yilan County       61753   Taiwan               Yilan   NaN   
4  City of Emeryville, CA       61790      USA       Emeryville, CA  NaN   

   Reporting Year Sector              Target boundary Baseline year  \
0            2016  Total                          NaN          2010   
1            2016  Total                          NaN          2005   
2            2016  Total                          NaN          2009   
3            2016  Total                          NaN          2009   
4            2016  Total  Overall community emissions          2004   

   Baseline emissions (metric tonnes CO2e)  Percentage reduction target  \
0                          

# 2016_-_Citywide_GHG_Emissions_20240207.csv

In [5]:
import pandas as pd
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, Float
from sqlalchemy import inspect

# Load the CSV data into a DataFrame
file_path = '2016_-_Citywide_GHG_Emissions_20240207.csv'
df = pd.read_csv(file_path)

# Clean column names
df.columns = [col.strip().replace('\u200b', '') for col in df.columns]

# Display the initial data structure
print("Initial Data:")
print(df.head())
print(df.info())

# Handling Missing Values
df['Total City-wide Emissions (metric tonnes CO2e)'] = df['Total City-wide Emissions (metric tonnes CO2e)'].fillna(df['Total City-wide Emissions (metric tonnes CO2e)'].median())
df['Total Scope 1 Emissions (metric tonnes CO2e)'] = df['Total Scope 1 Emissions (metric tonnes CO2e)'].fillna(df['Total Scope 1 Emissions (metric tonnes CO2e)'].median())
df['Total Scope 2 Emissions (metric tonnes CO2e)'] = df['Total Scope 2 Emissions (metric tonnes CO2e)'].fillna(df['Total Scope 2 Emissions (metric tonnes CO2e)'].median())

df['City Name'] = df['City Name'].fillna('Unknown City')
df['Country'] = df['Country'].fillna('Unknown Country')
df['Primary Methodology'] = df['Primary Methodology'].fillna('Unknown Methodology')

df['Reporting Year'] = pd.to_datetime(df['Reporting Year'], format='%Y', errors='coerce').dt.year
df['Reporting Year'] = df['Reporting Year'].fillna(0)

df['City Name'] = df['City Name'].str.title()
df.drop_duplicates(inplace=True)
df['City Name'] = df['City Name'].str.strip()
df['Country'] = df['Country'].replace({'Usa': 'USA', 'Uk': 'United Kingdom'})

# Step 10: Exporting Clean Data
cleaned_file_path = 'cleaned_data_MSSQL_2016_-_Citywide_GHG_Emissions_20240207.csv'
df.to_csv(cleaned_file_path, index=False)

# Define the connection parameters
server = 'JEFFREY'
database = 'EksamensProjekt_DB'
username = 'Oechsner'
password = '1234'
connection_string = f'mssql+pyodbc://{username}:{password}@{server}/{database}?driver=ODBC+Driver+17+for+SQL+Server'
engine = create_engine(connection_string)

# Define the metadata and table schema
metadata = MetaData()

# Define the table schema with an auto-incrementing primary key
table = Table(
    'Citywide_GHG_Emissions_2016', metadata,
    Column('id', Integer, primary_key=True, autoincrement=True),
    Column('Account Number', Integer),
    Column('City Name', String),
    Column('Country', String),
    Column('City Short Name', String),
    Column('C40', String),
    Column('Reporting Year', Integer),
    Column('Measurement Year', String),  # Changed to String to match the data format in the CSV
    Column('Boundary', String),
    Column('Primary Methodology', String),
    Column('Methodology Details', String),
    Column('Gases included', String),
    Column('Total City-wide Emissions (metric tonnes CO2e)', Float),
    Column('Total Scope 1 Emissions (metric tonnes CO2e)', Float),
    Column('Total Scope 2 Emissions (metric tonnes CO2e)', Float),
    Column('Increase/Decrease from last year', String),
    Column('Reason for increase/decrease in emissions', String),
    Column('Current Population Year', Integer),
    Column('Current Population', Float),
    Column('City GDP', Float),
    Column('GDP Currency', String),
    Column('Year of GDP', Float),
    Column('GDP Source', String),
    Column('Average annual temperature (in Celsius)', Float),
    Column('Land area (in square km)', Float),
    Column('Average altitude (m)', Float),
    Column('City Location', String),
    Column('Country Location', String)
)

# Drop the table if it already exists to ensure we start fresh
inspector = inspect(engine)
if 'Citywide_GHG_Emissions_2016' in inspector.get_table_names():
    table.drop(engine)

# Create the table in the database
metadata.create_all(engine)

# Insert data into the table without 'id' column
df.to_sql('Citywide_GHG_Emissions_2016', engine, if_exists='append', index=False)

print("Data successfully inserted into the Citywide_GHG_Emissions_2016 table.")


Initial Data:
   Account Number            City Name         Country City Short Name  C40  \
0           35894    Ville de Montreal          Canada        Montreal  NaN   
1           35898   Greater Manchester  United Kingdom      Manchester  NaN   
2           54128         City of Reno             USA            Reno  NaN   
3           35879  City of Minneapolis             USA     Minneapolis  NaN   
4           50558   City of London, ON          Canada      London, ON  NaN   

   Reporting Year        Measurement Year  \
0            2016  12/31/2009 12:00:00 AM   
1            2016  12/31/2013 12:00:00 AM   
2            2016  12/31/2014 12:00:00 AM   
3            2016  12/31/2014 12:00:00 AM   
4            2016  12/31/2014 12:00:00 AM   

                                            Boundary  \
0  Other: The regional entity that constitutes th...   
1                                A metropolitan area   
2      Administrative boundary of a local government   
3      Administr

Primary Key: Added an auto-incrementing primary key column (id).
Data Cleaning: Filled missing values in key columns with median values or default strings.
Export Clean Data: Saved the cleaned DataFrame to a new CSV file.
Database Table Creation: Created a new table in SQL Server with the appropriate schema and inserted the cleaned data.

Explanation:
  Load CSV: Read the CSV file into a DataFrame using pd.read_csv   Display Initial Data: Print the first few rows and the structure of the DataFrame to understand its conten    Display Column Names: Print the column names to ensure correct column names are used in the subsequent ste
    Handle Missing Val      Fill numeric columns with median values to avoid skewing the        Fill categorical columns with a placeholder value to maintain consisy.
    Data Type Conversions: Convert date columns to a datetime format for consistency and easier manipuon.
    Standardize Formats: Standardize text formats (e.g., capitalizing city names) to maintain consncy.
    Remove Duplicates: Remove any duplicate rows to ensure each record iique.
    Trim and Clean Strings: Remove extra spaces from strialues.
    Consolidate Categories: Replace similar categories (e.g., 'Usa' to 'USA') for cstency.
    Validate Data: Ensure all emissions values are negative.
    Create Unique Identifiers: Create a unique identifier for each row based on the 'Account nu' column.
    Export Clean Data: Save the cleaned DataFrame to a CSV file.
    Define Connection Parameters: Set up the connection parame for MSSQL.
    Create and Insert Data: Write the cleaned data to a new table in the MSSQL database.

# 2017_-_Cities_Community_Wide_Emissions.csv

In [42]:
import pandas as pd

# Load the CSV data into a DataFrame
file_path = '2017_-_Cities_Community_Wide_Emissions.csv'
df = pd.read_csv(file_path)

# Clean column names: Strip leading/trailing spaces, remove hidden characters, and replace spaces with underscores
df.columns = [col.strip().replace('\u200b', '').replace(' ', '_') for col in df.columns]

# Display the initial data structure
print("Initial Data:")
print(df.head())
print(df.info())

# Handling Missing Values
numeric_columns = [
    'Total_emissions_(metric_tonnes_CO2e)', 'Total_Scope_1_Emissions_(metric_tonnes_CO2e)',
    'Total_Scope_2_Emissions_(metric_tonnes_CO2e)', 'GDP', 'GDP_Year', 'Average_annual_temperature_(in_Celsius)',
    'Average_altitude_(m)', 'Land_area_(in_square_km)', 'Population', 'Population_year'
]
for column in numeric_columns:
    if column in df.columns:
        df[column] = df[column].fillna(df[column].median())

categorical_columns = [
    'City', 'Country', 'Protocol', 'Boundary', 'Region', 'Access', 'Protocol_column',
    'Gases_included', 'Scopes_Included', 'Comment', 'Increase/Decrease_from_last_year',
    'Reason_for_increase/decrease_in_emissions', 'GDP_Currency', 'GDP_Source'
]
for column in categorical_columns:
    if column in df.columns:
        df[column] = df[column].fillna('Unknown')

# Convert 'Reporting_year' and 'Accounting_year' to datetime format and extract the year
date_columns = ['Reporting_year', 'Accounting_year']
for column in date_columns:
    if column in df.columns:
        df[column] = pd.to_datetime(df[column], errors='coerce').dt.year
        df[column] = df[column].fillna(0).astype(int)

# Standardizing text formats: Title case for city names and country names
if 'City' in df.columns:
    df['City'] = df['City'].str.title()
if 'Country' in df.columns:
    df['Country'] = df['Country'].str.title()

# Replacing specific values for consistency
if 'Country' in df.columns:
    df['Country'] = df['Country'].replace({'Usa': 'USA', 'Uk': 'United Kingdom'})

# Removing duplicates
df.drop_duplicates(inplace=True)

# Trimming and cleaning strings
for column in df.select_dtypes(include=['object']).columns:
    df[column] = df[column].str.strip()

# Exporting Clean Data
cleaned_file_path = '/mnt/data/cleaned_data_MSSQL_2017_-_Cities_Community_Wide_Emissions.csv'
df.to_csv(cleaned_file_path, index=False)

# Display the cleaned data structure
print("\nCleaned Data:")
print(df.head())
print(df.info())

print("\nMissing Values After Cleaning:")
print(df.isna().sum())


Initial Data:
   Account_number                     Organization                 City  \
0           49363  Nelson Mandela Bay Municipality  Nelson Mandela Bay    
1           31171           Ayuntamiento de Madrid               Madrid   
2            3417                    New York City        New York City   
3           59537               City of Denton, TX           Denton, TX   
4           35894                Ville de Montreal             Montreal   

        Country         Region  C40  Access  Reporting_year  \
0  South Africa         Africa  NaN  Public            2017   
1         Spain         Europe  C40  Public            2017   
2           USA  North America  C40  Public            2017   
3           USA  North America  NaN  Public            2017   
4        Canada  North America  C40  Public            2017   

           Accounting_year                                       Boundary  \
0  2013-07-01 - 2014-06-30                            A metropolitan area   
1 

  df[column] = pd.to_datetime(df[column], errors='coerce').dt.year
  df[column] = pd.to_datetime(df[column], errors='coerce').dt.year


AttributeError: Can only use .dt accessor with datetimelike values

In [39]:
import pandas as pd
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, Float, Boolean, text

# Step 1: Load the CSV data into a DataFrame
file_path = '2017_-_Cities_Community_Wide_Emissions.csv'
df = pd.read_csv(file_path)

# Display the initial data structure
print("Initial Data:")
print(df.head())
print(df.info())

# Step 2: Clean column names
df.columns = df.columns.str.strip().str.replace('\u200b', '').str.replace(' ', '_')

# Rename columns to standardize the format
df.rename(columns={
    'Organization Number': 'Organization_Number',
    'Organization Name': 'Organization_Name',
    'Country/Area': 'Country_Area',
    'CDP Region': 'CDP_Region',
    'C40 City': 'C40_City',
    'GCoM City': 'GCoM_City',
    'Assessment attachment and/or direct link': 'Assessment_attachment_link',
    'Confirm attachment/link provided': 'Confirm_attachment_link_provided',
    'Boundary of assessment relative to jurisdiction boundary': 'Boundary_of_assessment',
    'Year of publication or approval': 'Year_of_publication',
    'Factors considered in assessment': 'Factors_considered_in_assessment',
    'Primary author(s) of assessment': 'Primary_authors_of_assessment',
    'Does the city have adaptation goal(s) and/or an adaptation plan?': 'Adaptation_goals_or_plan',
    'Population Year': 'Population_Year',
    'City Location': 'City_Location'
}, inplace=True)

# Step 3: Handle missing values
# Fill numeric columns with median values if they exist
numeric_columns = ['Year_of_publication', 'Population', 'Population_Year']

for col in numeric_columns:
    if col in df.columns:
        df[col] = df[col].fillna(df[col].median())

# Fill categorical columns with a placeholder value if they exist
categorical_columns = ['City', 'Country_Area', 'Organization_Name']

for col in categorical_columns:
    if col in df.columns:
        df[col] = df[col].fillna(f'Unknown_{col}')

# Convert date columns to datetime format and fill with a default value if they remain NaN
date_columns = ['Year_of_publication', 'Population_Year']

for col in date_columns:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], format='%Y', errors='coerce').dt.year
        df[col] = df[col].fillna(0)

# Display the cleaned data structure
print("\nCleaned Data:")
print(df.head())
print(df.info())

# Define the connection parameters
server = 'JEFFREY'
database = 'EksamensProjekt_DB'
username = 'Oechsner'
password = '1234'
connection_string = f'mssql+pyodbc://{username}:{password}@{server}/{database}?driver=ODBC+Driver+17+for+SQL+Server'
engine = create_engine(connection_string)

# Create the engine
engine = create_engine(connection_string)

# Step 5: Define the table schema
metadata = MetaData()
table = Table('Cities_Community_Wide_Emissions_2017', metadata,
              Column('id', Integer, primary_key=True, autoincrement=True),
              Column('Questionnaire', String),
              Column('Organization_Number', Integer),
              Column('Organization_Name', String),
              Column('City', String),
              Column('Country_Area', String),
              Column('CDP_Region', String),
              Column('C40_City', Boolean),
              Column('GCoM_City', Boolean),
              Column('Access', String),
              Column('Assessment_attachment_link', String),
              Column('Confirm_attachment_link_provided', String),
              Column('Boundary_of_assessment', String),
              Column('Year_of_publication', Integer),
              Column('Factors_considered_in_assessment', String),
              Column('Primary_authors_of_assessment', String),
              Column('Adaptation_goals_or_plan', String),
              Column('Population', Integer),
              Column('Population_Year', Integer),
              Column('City_Location', String),
              Column('Last_update', String)
              )

# Drop the table if it exists to redefine the id column with identity
with engine.connect() as conn:
    conn.execute(text("IF OBJECT_ID('Cities_Community_Wide_Emissions_2017', 'U') IS NOT NULL DROP TABLE Cities_Community_Wide_Emissions_2017"))

# Create the new table
metadata.create_all(engine)

# Step 6: Insert data into the table
df.to_sql('Cities_Community_Wide_Emissions_2017', engine, if_exists='append', index=False)

print("Data successfully inserted into the Cities_Community_Wide_Emissions_2017 table.")


Initial Data:
   Account number                     Organization                 City  \
0           49363  Nelson Mandela Bay Municipality  Nelson Mandela Bay    
1           31171           Ayuntamiento de Madrid               Madrid   
2            3417                    New York City        New York City   
3           59537               City of Denton, TX           Denton, TX   
4           35894                Ville de Montreal             Montreal   

        Country         Region  C40  Access  Reporting year  \
0  South Africa         Africa  NaN  Public            2017   
1         Spain         Europe  C40  Public            2017   
2           USA  North America  C40  Public            2017   
3           USA  North America  NaN  Public            2017   
4        Canada  North America  C40  Public            2017   

           Accounting year                                       Boundary  \
0  2013-07-01 - 2014-06-30                            A metropolitan area   
1 

ProgrammingError: (pyodbc.ProgrammingError) ('42S22', "[42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Account_number'. (207) (SQLExecDirectW); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Organization'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Country'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Region'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'C40'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Reporting_year'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Accounting_year'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Boundary'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Protocol'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Protocol_column'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Gases_included'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Total_emissions_(metric_tonnes_CO2e)'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Scopes_Included'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Total_Scope_1_Emissions_(metric_tonnes_CO2e)'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Total_Scope_2_Emissions_(metric_tonnes_CO2e)'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Comment'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Increase/Decrease_from_last_year'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Reason_for_increase/decrease_in_emissions'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'GDP'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'GDP_Currency'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'GDP_Year'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'GDP_Source'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Average_annual_temperature_(in_Celsius)'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Average_altitude_(m)'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Land_area_(in_square_km)'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Country_Location'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Account_number'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Organization'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Country'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Region'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'C40'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Reporting_year'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Accounting_year'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Boundary'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Protocol'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Protocol_column'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Gases_included'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Total_emissions_(metric_tonnes_CO2e)'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Scopes_Included'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Total_Scope_1_Emissions_(metric_tonnes_CO2e)'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Total_Scope_2_Emissions_(metric_tonnes_CO2e)'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Comment'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Increase/Decrease_from_last_year'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Reason_for_increase/decrease_in_emissions'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'GDP'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'GDP_Currency'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'GDP_Year'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'GDP_Source'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Average_annual_temperature_(in_Celsius)'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Average_altitude_(m)'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Land_area_(in_square_km)'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Country_Location'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Statement(s) could not be prepared. (8180)")
[SQL: INSERT INTO [Cities_Community_Wide_Emissions_2017] ([Account_number], [Organization], [City], [Country], [Region], [C40], [Access], [Reporting_year], [Accounting_year], [Boundary], [Protocol], [Protocol_column], [Gases_included], [Total_emissions_(me ... 6711 characters truncated ... ?, ?), (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)]
[parameters: (49363, 'Nelson Mandela Bay Municipality', 'Nelson Mandela Bay ', 'South Africa', 'Africa', None, 'Public', 2017, '2013-07-01 - 2014-06-30', 'A metropolitan area', 'Global Protocol for Community-Scale Greenhouse Gas Emissions Inventories (GPC), (WRI, C40 and ICLEI)', None, None, 12232310.0, 'Total Scope 1, Scope 2 and Waste Scope 3 (Total BASIC emissions)', 2249502.0, 9982808.0, None, 'Increased', 'We are gradually improving the accuracy of our data collection', 1152115, 2011, 52147.0, 'ZAR     South African Rand', 2010.0, 'Built Environment Performance Plan(BEPP)', 17.5, 69.0, 1950.0, '(-33.745241, 25.568108)', '(-30.559482, 22.937506)', 31171, 'Ayuntamiento de Madrid', 'Madrid', 'Spain', 'Europe', 'C40', 'Public', 2017, '2014-01-01 - 2014-12-31', 'Administrative boundary of a local government', 'Global Protocol for Community-Scale Greenhouse Gas Emissions Inventories (GPC), (WRI, C40 and ICLEI)', 'Adaptation from Madrid Air Quality Inventory elaborated under the frame of EMEP CORINAIR METHOLOGY', None, 9236196.0, 'Total Scope 1, Scope 2 and Waste Scope 3 (Total BASIC emissions)', 7069783.0, 3408424.0, None, 'Decreased' ... 1977 parameters truncated ... 'CO2; CH4; N2O; HFCs', 348347.0, 'Total Scope 1 and Scope 2', 348437.0, None, None, 'Decreased', 'Lower carbon emission electricity energy mix (grid).', 65600, 2010, None, 'USD     US Dollar', 2016.0, None, 23.0, 15.0, 25.0, '(38.544907, -121.740517)', '(37.09024, -95.712891)', 36037, 'Santiago de Cali', 'Santiago de Cali', 'Colombia', 'Latin America', None, 'Public', 2017, '2010-01-01 - 2010-12-31', 'Combination of administrative divisions', '2006 IPCC Guidelines for National Greenhouse Gas Inventories', None, 'CO2; CH4; N2O', 4174608.0, 'Total Scope 1 and Scope 2', None, None, None, None, None, 2369829, 2015, 4540.0, None, 2012.0, 'DAPM', 24.7, 995.0, 566.0, '(3.451647, -76.531985)', '(4.570868, -74.297333)')]
(Background on this error at: https://sqlalche.me/e/20/f405)

Explanation:

Load CSV: Read the CSV file into a DataFrame using pd.read_csv.
Display Initial Data: Print the first few rows and the structure of the DataFrame to understand its content.
Display Column Names: Print the column names to ensure correct column names are used in the subsequent steps.
Handle Missing Values:
Fill numeric columns with median values to avoid skewing the data.
Fill categorical columns with a placeholder value to maintain consistency.
Data Type Conversions: Convert date columns to a datetime format for consistency and easier manipulation.
Standardize Formats: Standardize text formats (e.g., capitalizing city names) to maintain consistency.
Remove Duplicates: Remove any duplicate rows to ensure each record is unique.
Trim and Clean Strings: Remove extra spaces from string values.
Consolidate Categories: Replace similar categories (e.g., 'Usa' to 'USA') for consistency.
Validate Data: Ensure all emissions values are non-negative.
Create Unique Identifiers: Create a unique identifier for each row based on the 'Account number' column.
Export Clean Data: Save the cleaned DataFrame to a new CSV file.
Define Connection Parameters: Set up the connection parameters for MSSQL.
Create and Insert Data: Write the cleaned data to a new table in the MSSQL database.

# 2017_-_Cities_Emissions_Reduction_Targets_20240207.csv

In [37]:
import pandas as pd
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, Float, text

# Step 1: Load the CSV data into a DataFrame
file_path = '2017_-_Cities_Emissions_Reduction_Targets_20240207.csv'
df = pd.read_csv(file_path)

# Display the initial data structure
print("Initial Data:")
print(df.head())
print(df.info())

# Clean column names: remove leading/trailing spaces, hidden characters, and replace spaces with underscores
df.columns = df.columns.str.strip().str.replace('\u200b', '').str.replace(' ', '_')

# Display the cleaned column names
print("\nCleaned Column Names:")
print(df.columns)

# Step 2: Handling Missing Values
print("\nMissing Values Before Cleaning:")
print(df.isna().sum())

# Fill numeric columns with median values
df['Baseline_emissions_(metric_tonnes_CO2e)'] = df['Baseline_emissions_(metric_tonnes_CO2e)'].fillna(df['Baseline_emissions_(metric_tonnes_CO2e)'].median())
df['Percentage_reduction_target'] = df['Percentage_reduction_target'].fillna(df['Percentage_reduction_target'].median())

# Fill categorical columns with a placeholder value
df['City'] = df['City'].fillna('Unknown_City')
df['Country'] = df['Country'].fillna('Unknown_Country')
df['Organisation'] = df['Organisation'].fillna('Unknown_Organisation')

# Convert 'Reporting_year' and other date columns to integer format and fill with a default value if they remain NaN
df['Reporting_year'] = pd.to_datetime(df['Reporting_year'], format='%Y', errors='coerce').dt.year
df['Reporting_year'] = df['Reporting_year'].fillna(0).astype(int)
df['Target_date'] = pd.to_datetime(df['Target_date'], format='%Y', errors='coerce').dt.year
df['Target_date'] = df['Target_date'].fillna(0).astype(int)

# Step 3: Data Type Conversions
df['Reporting_year'] = df['Reporting_year'].astype(int)
df['Target_date'] = df['Target_date'].astype(int)

# Step 4: Standardizing Formats
df['City'] = df['City'].str.title()

# Step 5: Removing Duplicates
df.drop_duplicates(inplace=True)

# Step 6: Trimming and Cleaning Strings
df['Organisation'] = df['Organisation'].str.strip()

# Step 7: Consolidating Categories
df['Country'] = df['Country'].replace({'Usa': 'USA', 'Uk': 'United_Kingdom'})

# Step 8: Validating Data
# Ensure all reduction targets are non-negative
df = df[df['Percentage_reduction_target'] >= 0]

# Step 9: Creating Unique Identifiers
df['id'] = pd.factorize(df['Account_No'])[0] + 1

# Rename columns to match SQL table definition
df.rename(columns={
    'Baseline_emissions_(metric_tonnes_CO2e)': 'Baseline_emissions_metric_tonnes_CO2e',
    'Estimated_business_as_usual_absolute_emissions_in_target_year_(metric_tonnes_CO2e)': 'Estimated_business_as_usual_absolute_emissions_in_target_year_metric_tonnes_CO2e',
    'Intensity_unit_(emissions_per)': 'Intensity_unit_emissions_per'
}, inplace=True)

# Step 10: Exporting Clean Data
cleaned_file_path = 'cleaned_data_MSSQL_2017_-_Cities_Emissions_Reduction_Targets.csv'
df.to_csv(cleaned_file_path, index=False)

# Display the cleaned data structure and info
print("\nCleaned Data:")
print(df.head())
print(df.info())

print("\nMissing Values After Cleaning:")
print(df.isna().sum())

# Step 11: Define the connection parameters
server = 'JEFFREY'
database = 'EksamensProjekt_DB'
username = 'Oechsner'
password = '1234'

# Create the connection string
connection_string = f'mssql+pyodbc://{username}:{password}@{server}/{database}?driver=ODBC+Driver+17+for+SQL+Server'

# Create the engine
engine = create_engine(connection_string)

# Step 12: Create a new table and insert data
metadata = MetaData()

# Define the table schema with an auto-incrementing primary key
table = Table(
    'Cities_Emissions_Reduction_Targets_2017', metadata,
    Column('id', Integer, primary_key=True, autoincrement=True),
    Column('Account_No', Integer),
    Column('Organisation', String),
    Column('City', String),
    Column('Country', String),
    Column('Region', String),
    Column('Access', String),
    Column('C40', String),
    Column('Reporting_year', Integer),
    Column('Type_of_target', String),
    Column('Sector', String),
    Column('Baseline_year', Integer),
    Column('Baseline_emissions_metric_tonnes_CO2e', Float),
    Column('Percentage_reduction_target', Float),
    Column('Target_date', Integer),
    Column('Estimated_business_as_usual_absolute_emissions_in_target_year_metric_tonnes_CO2e', Float),
    Column('Intensity_unit_emissions_per', String),
    Column('Comment', String),
    Column('Population', Float),
    Column('Population_Year', Float),
    Column('City_Location', String),
    Column('Country_Location', String)
)

# Drop the table if it already exists to ensure we start fresh
with engine.connect() as conn:
    conn.execute(text("IF OBJECT_ID('Cities_Emissions_Reduction_Targets_2017', 'U') IS NOT NULL DROP TABLE Cities_Emissions_Reduction_Targets_2017"))

# Create the table in the database
metadata.create_all(engine)

# Insert data into the table
df.to_sql('Cities_Emissions_Reduction_Targets_2017', engine, if_exists='append', index=False)

print("Data successfully inserted into the Cities_Emissions_Reduction_Targets_2017 table.")


Initial Data:
   Account No                   Organisation              City  \
0       54408                 Aarhus Kommune            Aarhus   
1       63616  Abasan Al-Kabira Municipality  Abasan Al-Kabira   
2       63616  Abasan Al-Kabira Municipality  Abasan Al-Kabira   
3        1499        Ajuntament de Barcelona         Barcelona   
4        1499        Ajuntament de Barcelona         Barcelona   

              Country               Region  Access  C40  Reporting year  \
0             Denmark               Europe  Public  NaN            2017   
1  State of Palestine  South and West Asia  Public  NaN            2017   
2  State of Palestine  South and West Asia  Public  NaN            2017   
3               Spain               Europe  Public  C40            2017   
4               Spain               Europe  Public  C40            2017   

    Type of target     Sector  ... Baseline emissions (metric tonnes CO2e)  \
0  Absolute target        NaN  ...                          

IntegrityError: (pyodbc.IntegrityError) ('23000', "[23000] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Cannot insert explicit value for identity column in table 'Cities_Emissions_Reduction_Targets_2017' when IDENTITY_INSERT is set to OFF. (544) (SQLExecDirectW)")
[SQL: INSERT INTO [Cities_Emissions_Reduction_Targets_2017] ([Account_No], [Organisation], [City], [Country], [Region], [Access], [C40], [Reporting_year], [Type_of_target], [Sector], [Baseline_year], [Baseline_emissions_metric_tonnes_CO2e], [Percentage_red ... 6597 characters truncated ... ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?), (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)]
[parameters: (54408, 'Aarhus Kommune', 'Aarhus', 'Denmark', 'Europe', 'Public', None, 2017, 'Absolute target', None, '2008', 1484767.0, 100.0, 2030, None, None, None, 336000.0, 2017.0, '(56.168393, 10.137373)', '(56.26392, 9.501785)', 1, 63616, 'Abasan Al-Kabira Municipality', 'Abasan Al-Kabira', 'State of Palestine', 'South and West Asia', 'Public', None, 2017, 'Absolute target', 'Buildings', '2010', 18320.0, 19.0, 2020, None, None, None, 30000.0, 2015.0, '(31.323126, 34.344025)', '(31.9522, 35.2332)', 2, 63616, 'Abasan Al-Kabira Municipality', 'Abasan Al-Kabira', 'State of Palestine', 'South and West Asia', 'Public' ... 1990 parameters truncated ... None, 133358.0, 2015.0, '(34.0007, -81.0348)', '(37.09024, -95.712891)', 52, 43910, 'City of Columbus', 'Columbus', 'USA', 'North America', 'Public', None, 2017, 'Absolute target', 'Total', '2013', 240971.0, 20.0, 2020, None, None, None, 850106.0, 2016.0, '(39.9611755, -82.9987942)', '(37.09024, -95.712891)', 53, 31009, 'City of Copenhagen', 'Copenhagen', 'Denmark', 'Europe', 'Public', 'C40', 2017, 'Baseline scenario (business as usual) target', 'Total', None, 1484767.0, 30.0, 2025, 900000.0, None, None, 602504.0, 2016.0, '(55.6760968, 12.5683371)', '(56.26392, 9.501785)', 54)]
(Background on this error at: https://sqlalche.me/e/20/gkpj)

Explanation:
  Load CSV: Read the CSV file into a DataFrame using pd.read_csv   Display Initial Data: Print the first few rows and the structure of the DataFrame to understand its conten    Display Column Names: Print the column names to ensure correct column names are used in the subsequent ste
    Handle Missing Val      Fill numeric columns with median values to avoid skewing the        Fill categorical columns with a placeholder value to maintain consisy.
    Data Type Conversions: Convert date columns to a datetime format for consistency and easier manipuon.
    Standardize Formats: Standardize text formats (e.g., capitalizing city names) to maintain consncy.
    Remove Duplicates: Remove any duplicate rows to ensure each record iique.
    Trim and Clean Strings: Remove extra spaces from strialues.
    Consolidate Categories: Replace similar categories (e.g., 'Usa' to 'USA') for cstency.
    Validate Data: Ensure all percentage reduction targets are negative.
    Create Unique Identifiers: Create a unique identifier for each row based on the 'Accoun' column.
    Export Clean Data: Save the cleaned DataFrame to a CSV file.
    Define Connection Parameters: Set up the connection parame for MSSQL.
    Create and Insert Data: Write the cleaned data to a new table in the MSSQL database.

# 2023_Cities_Climate_Risk_and_Vulnerability_Assessments_20240207.csv

In [9]:
import pandas as pd
from sqlalchemy import create_engine

# Step 1: Load the CSV data into a DataFrame
file_path = '2023_Cities_Climate_Risk_and_Vulnerability_Assessments_20240207.csv'
df = pd.read_csv(file_path)

# Display the initial data structure
print("Initial Data:")
print(df.head())
print(df.info())

# Display the column names to identify the correct column names
print("\nColumn Names:")
print(df.columns)

# Step 2: Handling Missing Values
print("\nMissing Values Before Cleaning:")
print(df.isna().sum())

# Fill numeric columns with median values if they exist
numeric_columns = ['Population', 'Population Year', 'Year of publication or approval']
for column in numeric_columns:
    if column in df.columns:
        df[column] = df[column].fillna(df[column].median())

# Fill categorical columns with a placeholder value if they exist
categorical_columns = ['City', 'Country/Area', 'Organization Name', 'Assessment attachment and/or direct link']
for column in categorical_columns:
    if column in df.columns:
        df[column] = df[column].fillna(f'Unknown {column}')

# Convert 'Year of publication or approval' to datetime format and fill with a default value if they remain NaN
if 'Year of publication or approval' in df.columns:
    df['Year of publication or approval'] = pd.to_datetime(df['Year of publication or approval'], format='%Y', errors='coerce').dt.year
    df['Year of publication or approval'] = df['Year of publication or approval'].fillna(0)

# Step 3: Data Type Conversions
# Convert date columns to datetime format for consistency
if 'Year of publication or approval' in df.columns:
    df['Year of publication or approval'] = pd.to_datetime(df['Year of publication or approval'], format='%Y', errors='coerce').dt.year

# Step 4: Standardizing Formats
if 'City' in df.columns:
    df['City'] = df['City'].str.title()

# Step 5: Removing Duplicates
df.drop_duplicates(inplace=True)

# Step 6: Trimming and Cleaning Strings
if 'Organization Name' in df.columns:
    df['Organization Name'] = df['Organization Name'].str.strip()

# Step 7: Consolidating Categories
if 'Country/Area' in df.columns:
    df['Country/Area'] = df['Country/Area'].replace({'Usa': 'USA', 'Uk': 'United Kingdom'})

# Step 8: Validating Data
# Ensure all numeric columns have non-negative values
for column in numeric_columns:
    if column in df.columns:
        df = df[df[column] >= 0]

# Step 9: Creating Unique Identifiers
if 'Organization Number' in df.columns:
    df['id'] = pd.factorize(df['Organization Number'])[0] + 1

# Step 10: Exporting Clean Data
cleaned_file_path = 'cleaned_data_MSSQL_2023_Cities_Climate_Risk_and_Vulnerability_Assessments.csv'
df.to_csv(cleaned_file_path, index=False)

# Display the cleaned data structure and info
print("\nCleaned Data:")
print(df.head())
print(df.info())

print("\nMissing Values After Cleaning:")
print(df.isna().sum())

# Step 11: Define the connection parameters
server = 'JEFFREY'
database = 'EksamensProjekt_DB'
username = 'Oechsner'
password = '1234'

# Create the connection string
connection_string = f'mssql+pyodbc://{username}:{password}@{server}/{database}?driver=ODBC+Driver+17+for+SQL+Server'

# Create the engine
engine = create_engine(connection_string)

# Step 12: Create a new table and insert data
table_name = 'Cities_Climate_Risk_and_Vulnerability_Assessments'

# Write the data to the SQL table
df.to_sql(table_name, engine, if_exists='replace', index=False)

print(f"Data successfully inserted into the {table_name} table.")


Initial Data:
  Questionnaire  Organization Number                Organization Name  \
0   Cities 2023               840926      Prefeitura de Serra Talhada   
1   Cities 2023                51075                 City of Shenzhen   
2   Cities 2023               863190                            Renca   
3   Cities 2023               930366  Municipalidad Distrital de Yura   
4   Cities 2023                60236          Trelleborg Municipality   

         City Country/Area     CDP Region  C40 City  GCoM City  Access  \
0         NaN       Brazil  Latin America     False       True  public   
1    Shenzhen        China      East Asia      True      False  public   
2         NaN        Chile  Latin America     False      False  public   
3         NaN         Peru  Latin America     False       True  public   
4  Trelleborg       Sweden         Europe     False       True  public   

            Assessment attachment and/or direct link  \
0  https://drive.google.com/file/d/19DMxxK532I

Explanation:
  Load CSV: Read the CSV file into a DataFrame using pd.read_csv   Display Initial Data: Print the first few rows and the structure of the DataFrame to understand its conten    Display Column Names: Print the column names to ensure correct column names are used in the subsequent ste
    Handle Missing Val      Fill numeric columns with median values to avoid skewing the        Fill categorical columns with a placeholder value to maintain consisy.
    Data Type Conversions: Convert date columns to a datetime format for consistency and easier manipuon.
    Standardize Formats: Standardize text formats (e.g., capitalizing city names) to maintain consncy.
    Remove Duplicates: Remove any duplicate rows to ensure each record iique.
    Trim and Clean Strings: Remove extra spaces from strialues.
    Consolidate Categories: Replace similar categories (e.g., 'Usa' to 'USA') for cstency.
    Validate Data: Ensure all numeric columns have non-nega values.
    Create Unique Identifiers: Create a unique identifier for each row based on the 'Organization Nu' column.
    Export Clean Data: Save the cleaned DataFrame to aw CSV file.
    Define Connection Parameters: Set up the connection parame for MSSQL.
    Create and Insert Data: Write the cleaned data to a new table in the MSSQL database.