Loading the dataset CSVs into data frames and printing the first 10 rows of each data frame to ensure that the data is being read and stored correctly

In [None]:
import pandas as pd

#Read each source csv into its own data frame
dataset_1 = pd.read_csv("CO2 emission by countries.csv")
dataset_2 = pd.read_csv("Historical Tropical Storm.csv")
dataset_3 = pd.read_csv("global_temps.csv")

#Show the first 10 rows of every frame, each under its own heading and followed by a blank gap
for heading, frame in (
    ("From Dataset 1: CO2 emission by countries", dataset_1),
    ("From Dataset 2: Historical Tropical Storm", dataset_2),
    ("From Dataset 3: global_temps", dataset_3),
):
    print(heading)
    print(frame.head(10))
    print("\n")

The "CO2 emission by countries" dataset currently has these attributes: "Country", "Code", "Calling Code", "Year", "CO2 emission (Tons)", "Population(2022)", "Area", "% of World" and "Density(km2)", but from this dataset we only require "Year" and "CO2 emission (Tons)", so the others will be removed to reduce the size of the final dataset

In [None]:
#Columns of "CO2 emission by countries" that are not needed downstream
dropping = ["Country", "Code", "Calling Code", "Population(2022)", "Area", "% of World", "Density(km2)"]

#Trim whitespace around the header names so they match the labels listed in `dropping`
dataset_1.columns = dataset_1.columns.str.strip()

#errors="ignore" skips any listed column that is not present in the frame
dataset_1 = dataset_1.drop(columns=dropping, errors="ignore")

#Write the reduced frame back over the source csv, without the index column
dataset_1.to_csv("CO2 emission by countries.csv", index=False)
print("CO2 emission by countries dataset, has been updated")

The "Historical Tropical Storm" dataset currently has these attributes: "FID", "YEAR", "MONTH", "DAY", "AD_TIME", "BTID", "NAME", "LAT", "LONG", "WIND_KTS", "PRESSURE", "CAT", "BASIN" and "Shape_Leng" but from this dataset we only require "YEAR", "MONTH", "DAY", "LAT", "LONG", "WIND_KTS", "PRESSURE", "CAT" and "Shape_Leng" so the others will be removed to reduce the size of the final dataset

In [None]:
#Columns of "Historical Tropical Storm" that are not needed downstream
dropping_2 = ["FID", "AD_TIME", "BTID", "NAME", "BASIN"]

#Trim whitespace around the header names so they match the labels listed in `dropping_2`
dataset_2.columns = dataset_2.columns.str.strip()

#errors="ignore" skips any listed column that is not present in the frame
dataset_2 = dataset_2.drop(columns=dropping_2, errors="ignore")

#Write the reduced frame back over the source csv, without the index column
dataset_2.to_csv("Historical Tropical Storm.csv", index=False)
print("Historical Tropical Storm dataset, has been updated")

The "global_temps" dataset currently has these attributes: "Year", "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", "J-D", "D-N", "DJF", "MAM", "JJA" and "SON", but from this dataset we only require "Year", "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov" and "Dec", so the others will be removed to reduce the size of the final dataset

In [None]:
#Columns of "global_temps" that are not needed downstream
dropping_3 = ["J-D", "D-N", "DJF", "MAM", "JJA", "SON"]

#Trim whitespace around the header names so they match the labels listed in `dropping_3`
dataset_3.columns = dataset_3.columns.str.strip()

#errors="ignore" skips any listed column that is not present in the frame
dataset_3 = dataset_3.drop(columns=dropping_3, errors="ignore")

#Write the reduced frame back over the source csv, without the index column
dataset_3.to_csv("global_temps.csv", index=False)
print("global_temps dataset, has been updated")

Cleaning the data by replacing NaN values with appropriate values for the specific column, after first identifying where the NaN values are in the datasets

In [None]:
#Count the missing values per column in each dataset and print them under a heading
missing_reports = [
    ("Missing values from Dataset 1: CO2 emission by countries", dataset_1),
    ("Missing values from Dataset 2: Historical Tropical Storm", dataset_2),
    ("Missing values from Dataset 3: global_temps", dataset_3),
]

for heading, frame in missing_reports:
    print(heading)
    print(frame.isnull().sum())
    print("\n")

In [None]:
#Cleaning the global_temps dataset

#Fill every NaN in the listed month columns with the average value of its own column
find_avg_value= ["Jun","Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]

#Cast to float before averaging; NaN is kept as NaN by this cast.
#The NaN values must NOT be replaced by 0 first: that would drag the averages down
#and leave nothing for the fillna below to replace.
dataset_3[find_avg_value] = dataset_3[find_avg_value].astype(float)

#mean() skips NaN by default, so each average is taken over the observed values only
agv_value = dataset_3[find_avg_value].mean()

#fillna with a Series fills each column with the entry whose label matches the column name
dataset_3[find_avg_value] = dataset_3[find_avg_value].fillna(agv_value)


#Updating the dataset_3 csv
dataset_3.to_csv("global_temps.csv", index= False)

#Testing if the change was made
print("Missing values from Dataset 3: global_temps")
print(dataset_3.isnull().sum())
print("\n")


Checking if the current data types for all attributes in all datasets are of the correct types for further processing

In [None]:
#dataset_1

#Print the column summary of dataset_1 so the types can be checked by eye
dataset_1.info()

#Author's note: the types were judged correct, so no changes are made here

In [None]:
#dataset_2

#Print the column summary of dataset_2 so the types can be checked by eye
dataset_2.info()

#Author's note: the types were judged correct, so no changes are made here


In [None]:
#dataset_3

#Print the column summary of dataset_3 so the types can be checked by eye
dataset_3.info()

Merging the 3 datasets together: first dataset_2 "Historical Tropical Storm" is joined with dataset_1 "CO2 emission by countries" on their year attribute, then the joined result is joined with "global_temps", again on the year attribute

In [None]:
import csv

#Rename the storm dataset's "YEAR" column so all three frames share the join key "Year"
dataset_2 = dataset_2.rename(columns={"YEAR": "Year"})

#how="left" keeps every row of the LEFT frame (dataset_2, the storm records);
#rows of dataset_1 whose Year has no storm record are not carried over.
#NOTE(review): if dataset_1 holds several rows for the same Year, each storm row is
#repeated once per matching dataset_1 row - confirm this is intended.
dataset_4 = pd.merge(dataset_2, dataset_1, on= "Year", how= "left")

print("Merged Dataset of Historical Tropical Storm and CO2 emission by countries")
print(dataset_4.head(5))

#Again a left join: every row of dataset_4 is kept and the matching global_temps columns are added
final_dataset = pd.merge(dataset_4, dataset_3, on= "Year", how= "left")
print(final_dataset.head(5))

#creating and writing to a new csv
#to_csv writes the whole frame in one call and keeps each column's own dtype;
#looping over iterrows() is far slower and coerces every row to a single dtype,
#which can turn whole numbers into floats in the output file.
final_dataset.to_csv("completed_dataset_for_IS_project_25.csv", index=False, encoding="utf-8")

print("Final Dataset csv has been created")

The merged dataset currently has 1,048,576 rows. I'm going to remove the rows that lack the necessary information, further reducing the size of the dataset

The necessary columns are:  "Year", "MONTH", "DAY", "LAT", "LONG", "WIND_KTS", "PRESSURE", "CAT", "Shape_Leng", "CO2 emission (Tons)", "Jan", "Feb", "Mar", "Apr", "May", "Jun","Jul", "Aug", "Sep", "Oct", "Nov" and "Dec"

If any data is missing then remove that row

In [None]:
import dask.dataframe as dd

import os

#Every column listed here must be present (non-NaN) for a row to be kept
necessary_columns = ["Year", "MONTH", "DAY", "LAT", "LONG", "WIND_KTS", "PRESSURE", "CAT", "Shape_Leng", "CO2 emission (Tons)", "Jan", "Feb", "Mar", "Apr", "May", "Jun","Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]

final_csv_path = "completed_dataset_for_IS_project_25.csv"
temp_csv_path = final_csv_path + ".tmp"

large_dataset = dd.read_csv(final_csv_path)

#Drop every row that has a missing value in any of the necessary columns
cleaned_final_dataset = large_dataset.dropna(subset=necessary_columns)

print(cleaned_final_dataset.head(5))

#Updating the final dataset csv
#dask reads the input lazily, so writing straight onto the file that is still being read
#risks truncating it before all of it has been loaded. Write to a temporary file first,
#then swap it into place once the write has finished.
cleaned_final_dataset.to_csv(temp_csv_path, index= False, single_file=True)
os.replace(temp_csv_path, final_csv_path)

print("The dataset is ready for processing")