Imports and Load data

In [23]:
import pandas as pd
import sqlite3

# Load the whole avg_temperatures table from the SQLite file into a DataFrame.
# The database path is relative to the notebook's working directory.
conn = sqlite3.connect("../climate_data.db")
try:
    df = pd.read_sql_query("SELECT * FROM avg_temperatures", conn)
finally:
    # Close the handle even when the query raises (e.g. the table is missing).
    # An explicit close is needed: using the sqlite3 connection as a context
    # manager only commits/rolls back the transaction, it does not close it.
    conn.close()

# Initial preview of the first rows
print(df.head())


     City  Year  AvgMaxTemp
0  Zurich  2014       14.83
1  Zurich  2015       14.78
2  Zurich  2016       13.97
3  Zurich  2017       14.46
4  Zurich  2018       15.89


Checking data types

In [18]:
# Print the dtype of every column in df as a quick schema check.
print(df.dtypes)

City           object
Year            int64
AvgMaxTemp    float64
dtype: object


Cleaning

In [19]:
# Report the number of missing values per column before any rows are removed.
print(df.isnull().sum())

# Drop rows containing missing values, then drop exact duplicate rows.
# The result is rebound to `df` instead of mutating with inplace=True, which
# keeps the step a single readable chain and avoids in-place edits of a frame
# that earlier cells have already displayed.
df = df.dropna().drop_duplicates()


City          0
Year          0
AvgMaxTemp    0
dtype: int64


Regex: extract numeric temperature values

In [20]:
import re

# Simulate a text column: render every temperature as a string.
df["AvgMaxTemp"] = df["AvgMaxTemp"].astype(str)

# Pattern for the first numeric token in a string.
# - The optional sign sits outside the alternation so it applies to integers
#   ("-5") as well as decimals ("-5.25"); previously the sign was only matched
#   for the decimal alternative and negative integers came back positive.
# - An optional exponent keeps values such as "1e-05" intact instead of
#   truncating them to "1".
NUMBER_PATTERN = r"([-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)"

# expand=False makes str.extract return a Series (one capture group) rather
# than a one-column DataFrame. Strings with no numeric token yield NaN.
df["AvgMaxTemp"] = (
    df["AvgMaxTemp"]
    .str.extract(NUMBER_PATTERN, expand=False)
    .astype(float)
)


Save Data

In [21]:
# Write the cleaned frame to its own table in the same SQLite file.
# if_exists="replace" overwrites any previous avg_temperatures_cleaned table,
# and index=False keeps the DataFrame index out of the stored columns.
conn = sqlite3.connect("../climate_data.db")
try:
    df.to_sql("avg_temperatures_cleaned", conn, if_exists="replace", index=False)
finally:
    # Always release the database handle, even if the write fails.
    conn.close()
