Loading the dataset csv's into data frames and printing the first 10 values in the data set for each data frame to ensure that the data is being read and stored correctly

In [23]:
import pandas as pd

dataset_1 = pd.read_csv("CO2 emission by countries.csv")
dataset_2 = pd.read_csv("Historical Tropical Storm.csv")
dataset_3 = pd.read_csv("global_temps.csv")

print("From Dataset 1: CO2 emission by countries")
print(dataset_1.head(10))
print("\n")

print("From Dataset 2: Historical Tropical Storm")
print(dataset_2.head(10))
print("\n")

print("From Dataset 3: global_temps")
print(dataset_3.head(10))
print("\n")

From Dataset 1: CO2 emission by countries
       Country Code Calling Code  Year  CO2 emission (Tons)  Population(2022)  \
0  Afghanistan   AF           93  1750                  0.0        41128771.0   
1  Afghanistan   AF           93  1751                  0.0        41128771.0   
2  Afghanistan   AF           93  1752                  0.0        41128771.0   
3  Afghanistan   AF           93  1753                  0.0        41128771.0   
4  Afghanistan   AF           93  1754                  0.0        41128771.0   
5  Afghanistan   AF           93  1755                  0.0        41128771.0   
6  Afghanistan   AF           93  1756                  0.0        41128771.0   
7  Afghanistan   AF           93  1757                  0.0        41128771.0   
8  Afghanistan   AF           93  1758                  0.0        41128771.0   
9  Afghanistan   AF           93  1759                  0.0        41128771.0   

       Area % of World Density(km2)  
0  652230.0      0.40%      

The "CO2 emission by countries" dataset currently has these attributes: Country", "Code", "Calling Code", "Year", "CO2 emission (Tons)", "Population(2022)", "Area", "% of World" and "Density(km2)" but from this dataset we only require "Year" and "CO2 emission (Tons)" so the others will be removed to reduce the size of the final dataset

In [None]:
#Removing unnecessary columns from the datasets "CO2 emission by countries"
dataset_1.columns = dataset_1.columns.str.strip()
dropping = ["Country", "Code", "Calling Code", "Population(2022)", "Area", "% of World", "Density(km2)"]
dataset_1.drop(columns=dropping, inplace= True, errors= "ignore")

#updating the csv
dataset_1.to_csv("CO2 emission by countries.csv", index= False)
print("CO2 emission by countries dataset, has been updated")

CO2 emission by countries dataset, has been updated


The "Historical Tropical Storm" dataset currently has these attributes: "FID", "YEAR", "MONTH", "DAY", "AD_TIME", "BTID", "NAME", "LAT", "LONG", "WIND_KTS", "PRESSURE", "CAT", "BASIN" and "Shape_Leng" but from this dataset we only require "YEAR", "MONTH", "DAY", "LAT", "LONG", "WIND_KTS", "PRESSURE", "CAT" and "Shape_Leng" so the others will be removed to reduce the size of the final dataset

In [None]:
#Removing unnecessary columns from the dataset "Historical Tropical Storm"
dataset_2.columns = dataset_2.columns.str.strip()
dropping_2 = ["FID", "AD_TIME", "BTID", "NAME", "BASIN"]
dataset_2.drop(columns=dropping_2, inplace= True, errors= "ignore")

#updating the csv
dataset_2.to_csv("Historical Tropical Storm.csv", index= False)
print("Historical Tropical Storm dataset, has been updated")

Historical Tropical Storm dataset, has been updated


The "global_temps" dataset currently has these attributes: "Year", "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", "J-D", "D-N", "DJF", "MAM", "JJA" and "SON" but from this dataset we only require "Year", "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov"and "Dec" so the others will be removed to reduce the size of the final dataset

In [None]:
#Removing unnecessary columns from the dataset "global_temps"
dataset_3.columns = dataset_3.columns.str.strip()
dropping_3 = ["J-D", "D-N", "DJF", "MAM", "JJA", "SON"]
dataset_3.drop(columns=dropping_3, inplace= True, errors= "ignore")

#updating the csv
dataset_3.to_csv("global_temps.csv", index= False)
print("global_temps dataset, has been updated")

global_temps dataset, has been updated


Cleaning the data by replacing nan values with approprate values for the specific column, by first identifying where the nan values are in the datasets

In [27]:
#Identify the number of missing values in the datasets
print("Missing values from Dataset 1: CO2 emission by countries")
print(dataset_1.isnull().sum())
print("\n")

print("Missing values from Dataset 2: Historical Tropical Storm")
print(dataset_2.isnull().sum())
print("\n")

print("Missing values from Dataset 3: global_temps")
print(dataset_3.isnull().sum())
print("\n")

Missing values from Dataset 1: CO2 emission by countries
Year                   0
CO2 emission (Tons)    0
dtype: int64


Missing values from Dataset 2: Historical Tropical Storm
YEAR          0
MONTH         0
DAY           0
LAT           0
LONG          0
WIND_KTS      0
PRESSURE      0
CAT           0
Shape_Leng    0
dtype: int64


Missing values from Dataset 3: global_temps
Year    0
Jan     0
Feb     0
Mar     0
Apr     0
May     0
Jun     1
Jul     1
Aug     1
Sep     1
Oct     1
Nov     1
Dec     1
dtype: int64




In [28]:
#Cleaning the global_temps dataset

#Filling in all nan values with the avarage value of their column
find_avg_value= ["Jun","Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]

#temp assigning the nan values to 0 for the summing
dataset_3[find_avg_value] = dataset_3[find_avg_value].fillna(0)

#casting to float
dataset_3[find_avg_value] = dataset_3[find_avg_value].astype(float)

agv_value = dataset_3[find_avg_value].mean()
dataset_3[find_avg_value] = dataset_3[find_avg_value].fillna(agv_value) 


#Updating the dataset_3 csv
dataset_3.to_csv("global_temps.csv", index= False)

#Testing if the change was made
print("Missing values from Dataset 3: global_temps")
print(dataset_3.isnull().sum())
print("\n")


Missing values from Dataset 3: global_temps
Year    0
Jan     0
Feb     0
Mar     0
Apr     0
May     0
Jun     0
Jul     0
Aug     0
Sep     0
Oct     0
Nov     0
Dec     0
dtype: int64




Checking if the current data types for all attributes in all datasets are of the correct types for further processing

In [None]:
#dataset_1

#Before 
dataset_1.info()

#The dataset is of the correct types, no chanages to be done here

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59620 entries, 0 to 59619
Data columns (total 2 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Year                 59620 non-null  int64  
 1   CO2 emission (Tons)  59620 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 931.7 KB


In [None]:
#dataset_2

#Before 
dataset_2.info()

#The dataset is of the correct types, no chanages to be done here


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59228 entries, 0 to 59227
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   YEAR        59228 non-null  int64  
 1   MONTH       59228 non-null  int64  
 2   DAY         59228 non-null  int64  
 3   LAT         59228 non-null  float64
 4   LONG        59228 non-null  float64
 5   WIND_KTS    59228 non-null  int64  
 6   PRESSURE    59228 non-null  int64  
 7   CAT         59228 non-null  object 
 8   Shape_Leng  59228 non-null  float64
dtypes: float64(3), int64(5), object(1)
memory usage: 4.1+ MB


In [31]:
#dataset_3

dataset_3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144 entries, 0 to 143
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Year    144 non-null    int64  
 1   Jan     144 non-null    float64
 2   Feb     144 non-null    float64
 3   Mar     144 non-null    float64
 4   Apr     144 non-null    float64
 5   May     144 non-null    float64
 6   Jun     144 non-null    float64
 7   Jul     144 non-null    float64
 8   Aug     144 non-null    float64
 9   Sep     144 non-null    float64
 10  Oct     144 non-null    float64
 11  Nov     144 non-null    float64
 12  Dec     144 non-null    float64
dtypes: float64(12), int64(1)
memory usage: 14.8 KB


Merging the 3 datasets together, dataset_2 "Historical Tropical Storm" with joining dataset_1 "CO2 emission by countries" using their year attribute, then taking the joined datasets and connecting it to "global_temps" using the year attribute 

In [None]:
import csv

#changing the attribute names to match where the are being joined
dataset_2 = dataset_2.rename(columns={"YEAR": "Year"})

dataset_4 = pd.merge(dataset_2, dataset_1, on= "Year", how= "left") #left for keeping all data from dataset_1

print("Merged Dataset of Historical Tropical Storm and CO2 emission by countries")
print(dataset_4.head(5))

final_dataset = pd.merge(dataset_4, dataset_3, on= "Year", how= "left") #left for keeping all data from dataset_4
print(final_dataset.head(5))

#creating and writing to a new csv

with open("completed_dataset_for_IS_project_25.csv", 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(final_dataset.columns)

    for index, row in final_dataset.iterrows():
        writer.writerow(row)

print("Final Dataset csv has been created")

Merged Dataset of Historical Tropical Storm and CO2 emission by countries
   Year  MONTH  DAY   LAT  LONG  WIND_KTS  PRESSURE CAT  Shape_Leng  \
0  1851      7    5  22.2 -97.6        80         0  H1    0.141421   
1  1851      7    5  22.2 -97.6        80         0  H1    0.141421   
2  1851      7    5  22.2 -97.6        80         0  H1    0.141421   
3  1851      7    5  22.2 -97.6        80         0  H1    0.141421   
4  1851      7    5  22.2 -97.6        80         0  H1    0.141421   

   CO2 emission (Tons)  
0                  0.0  
1                  0.0  
2                  0.0  
3                  0.0  
4                  0.0  
   Year  MONTH  DAY   LAT  LONG  WIND_KTS  PRESSURE CAT  Shape_Leng  \
0  1851      7    5  22.2 -97.6        80         0  H1    0.141421   
1  1851      7    5  22.2 -97.6        80         0  H1    0.141421   
2  1851      7    5  22.2 -97.6        80         0  H1    0.141421   
3  1851      7    5  22.2 -97.6        80         0  H1    0.1414

NameError: name 'csv' is not defined