In [22]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [23]:
# TODO: create seperate a file for preprocessed data??
df = pd.read_csv('../data/raw/average-monthly-surface-temperature.csv')
df.reset_index()
df = df.rename(columns={'Average surface temperature': 'Daily Average Temp', 'Average surface temperature.1': 'Monthly Average Temp'})

In [24]:
df.sample(5)

Unnamed: 0,Entity,Code,year,Day,Daily Average Temp,Monthly Average Temp
71827,Germany,DEU,1975,1975-08-15,19.175041,9.095744
143363,Philippines,PHL,1986,1986-12-15,24.455791,25.443499
161886,Slovenia,SVN,2000,2000-07-15,17.86719,10.674691
117019,Mauritius,MUS,2001,2001-08-15,20.985521,23.322418
172489,Sweden,SWE,1949,1949-02-15,-2.759978,4.084469


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198900 entries, 0 to 198899
Data columns (total 6 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Entity                198900 non-null  object 
 1   Code                  198900 non-null  object 
 2   year                  198900 non-null  int64  
 3   Day                   198900 non-null  object 
 4   Daily Average Temp    198900 non-null  float64
 5   Monthly Average Temp  198900 non-null  float64
dtypes: float64(2), int64(1), object(3)
memory usage: 9.1+ MB


In [26]:
df.describe()

Unnamed: 0,year,Daily Average Temp,Monthly Average Temp
count,198900.0,198900.0,198900.0
mean,1982.0,18.072073,18.072073
std,24.53575,10.246142,8.710114
min,1940.0,-36.240032,-21.529121
25%,1961.0,12.304079,10.569263
50%,1982.0,22.055794,21.856285
75%,2003.0,25.317015,25.142885
max,2024.0,39.889374,29.79422


# Missing values

In [27]:
missing_values = df.isnull().sum()
print("Missing Values:")
print(missing_values)

# there are no missing values

Missing Values:
Entity                  0
Code                    0
year                    0
Day                     0
Daily Average Temp      0
Monthly Average Temp    0
dtype: int64


# Duplicates

In [28]:
duplicate_rows = df[df.duplicated()]
print("Duplicate Rows:")
print(duplicate_rows)

# there are no duplicate rows

Duplicate Rows:
Empty DataFrame
Columns: [Entity, Code, year, Day, Daily Average Temp, Monthly Average Temp]
Index: []


# Convert Data Types

In [29]:
df["Day"] = pd.to_datetime(df["Day"])
df

Unnamed: 0,Entity,Code,year,Day,Daily Average Temp,Monthly Average Temp
0,Afghanistan,AFG,1940,1940-01-15,-2.032494,11.327695
1,Afghanistan,AFG,1940,1940-02-15,-0.733503,11.327695
2,Afghanistan,AFG,1940,1940-03-15,1.999134,11.327695
3,Afghanistan,AFG,1940,1940-04-15,10.199754,11.327695
4,Afghanistan,AFG,1940,1940-05-15,17.942135,11.327695
...,...,...,...,...,...,...
198895,Zimbabwe,ZWE,2024,2024-08-15,20.559408,22.921250
198896,Zimbabwe,ZWE,2024,2024-09-15,23.642931,22.921250
198897,Zimbabwe,ZWE,2024,2024-10-15,24.407030,22.921250
198898,Zimbabwe,ZWE,2024,2024-11-15,25.672321,22.921250


# Verifying Entity and Code columns

In [30]:
for name, num_of_occurrences in df['Entity'].value_counts().items():
    # check if every country has the same number of entries
    if num_of_occurrences != 1020:
        print(name, "has", num_of_occurrences)

for name in df["Entity"].unique():
    # check if every country name (Entity column) matches the country code 
    correct_code = df.loc[df["Entity"] == name].iloc[0]["Code"]      
    incorrect_rows = df.loc[(df["Entity"] == name) & (df["Code"] != correct_code)]
    if incorrect_rows.shape[0] > 0:
        display(incorrect_rows)


# Drop Unnecessary Columns

In [31]:
# having both "Enity" and "Code" is redundant, so I'm dropping a column
df.drop("Entity", axis=1, inplace=True, errors='ignore')
df

Unnamed: 0,Code,year,Day,Daily Average Temp,Monthly Average Temp
0,AFG,1940,1940-01-15,-2.032494,11.327695
1,AFG,1940,1940-02-15,-0.733503,11.327695
2,AFG,1940,1940-03-15,1.999134,11.327695
3,AFG,1940,1940-04-15,10.199754,11.327695
4,AFG,1940,1940-05-15,17.942135,11.327695
...,...,...,...,...,...
198895,ZWE,2024,2024-08-15,20.559408,22.921250
198896,ZWE,2024,2024-09-15,23.642931,22.921250
198897,ZWE,2024,2024-10-15,24.407030,22.921250
198898,ZWE,2024,2024-11-15,25.672321,22.921250


# Data Visualization

In [32]:
'''
Global Temperature Trend Over Time
Overall trend of average surface temperatures from 1940 to 2024
'''

# For each year, average the 

'\nGlobal Temperature Trend Over Time\nOverall trend of average surface temperatures from 1940 to 2024\n'

In [33]:
'''
# plot average temperatures for each year
avg_temps_by_year = df.groupby("year").agg(
    avg_temp=("Daily Average Temp", "mean"),
).reset_index()

avg_temps_by_year = pd.melt(avg_temps_by_year, ["year"], value_name='Temperature (°C)', var_name="Legend")
sns.lineplot(data=avg_temps_by_year, x="year", y='Temperature (°C)', hue="Legend")
'''

'\n# plot average temperatures for each year\navg_temps_by_year = df.groupby("year").agg(\n    avg_temp=("Daily Average Temp", "mean"),\n).reset_index()\n\navg_temps_by_year = pd.melt(avg_temps_by_year, ["year"], value_name=\'Temperature (°C)\', var_name="Legend")\nsns.lineplot(data=avg_temps_by_year, x="year", y=\'Temperature (°C)\', hue="Legend")\n'

In [34]:
'''
# plot average temperatures for each month (Day column)
df['Month'] = pd.to_datetime(df['Day']).dt.month
#df.reset_index()
avg_temps_by_month = df.groupby(["Month", "year"]).aggregate(
    avg_temp=("Monthly Average Temp", "mean"),
    countries=("Code", "count"),
)

#avg_temps_by_month = pd.melt(avg_temps_by_month, ["Month"], value_name='Temperature (°C)', var_name="Legend")
#sns.lineplot(data=avg_temps_by_month, x="Month", y='Temperature (°C)', hue="Legend")
#df.sample(10)

#print(avg_temps_by_month.first())
#avg_temps_by_month
#avg_temps_by_month = df.groupby(by="Month").sum()
display(avg_temps_by_month)

# TODO: do bar chart for this
'''

'\n# plot average temperatures for each month (Day column)\ndf[\'Month\'] = pd.to_datetime(df[\'Day\']).dt.month\n#df.reset_index()\navg_temps_by_month = df.groupby(["Month", "year"]).aggregate(\n    avg_temp=("Monthly Average Temp", "mean"),\n    countries=("Code", "count"),\n)\n\n#avg_temps_by_month = pd.melt(avg_temps_by_month, ["Month"], value_name=\'Temperature (°C)\', var_name="Legend")\n#sns.lineplot(data=avg_temps_by_month, x="Month", y=\'Temperature (°C)\', hue="Legend")\n#df.sample(10)\n\n#print(avg_temps_by_month.first())\n#avg_temps_by_month\n#avg_temps_by_month = df.groupby(by="Month").sum()\ndisplay(avg_temps_by_month)\n\n# TODO: do bar chart for this\n'

In [35]:
'''
new_df = pd.DataFrame()
new_df['Months'] = df['Month']
'''

"\nnew_df = pd.DataFrame()\nnew_df['Months'] = df['Month']\n"

In [36]:
#df.drop("Month", axis=1)

# Export preprocessed dataset to new CSV file

In [37]:
df.to_csv('../data/processed/preprocessed_data.csv', index=False)