
**1. Import All Necessary Libraries**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler , MinMaxScaler

**2. Load the Dataset**

In [2]:
# Read the two source CSV files: annual temperature records and global CO2 emissions.
df = pd.read_csv("annual_temp.csv")
dt = pd.read_csv("global_co2.csv")

# Echo each freshly loaded frame under its own heading.
for heading, frame in (
    ("The Data of Annual Temperature: \n", df),
    ("\nThe data of Global_CO2: \n", dt),
):
    print(heading, frame)

The Data of Annual Temperature: 
       Source  Year    Mean
0       GCAG  2015  0.8990
1    GISTEMP  2015  0.8700
2       GCAG  2014  0.7402
3    GISTEMP  2014  0.7500
4       GCAG  2013  0.6687
..       ...   ...     ...
267  GISTEMP  1882 -0.1000
268     GCAG  1881 -0.0707
269  GISTEMP  1881 -0.1200
270     GCAG  1880 -0.1247
271  GISTEMP  1880 -0.2000

[272 rows x 3 columns]

The data of Global_CO2: 
      Year  Total  Gas Fuel  Liquid Fuel  Solid Fuel  Cement  Gas Flaring  \
0    1751      3         0            0           3       0            0   
1    1752      3         0            0           3       0            0   
2    1753      3         0            0           3       0            0   
3    1754      3         0            0           3       0            0   
4    1755      3         0            0           3       0            0   
..    ...    ...       ...          ...         ...     ...          ...   
255  2006   8370      1525         3089        3339     356

**3. Get the Information, Summary Statistics, Column Names and Shape of the Datasets**

In [3]:
# Overview of the annual temperature data: structure, summary statistics, shape.
print("\nAnnual Temperature Data Info:")
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called directly; wrapping it in print() emits a stray "None" line.
df.info()
print("\nAnnual Temperature Data Summary Statistics:")
print(df.describe())
print("\nShape of Annual Temperature Data:")
print(df.shape)



Annual Temperature Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 272 entries, 0 to 271
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Source  272 non-null    object 
 1   Year    272 non-null    int64  
 2   Mean    272 non-null    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.5+ KB
None

Annual Temperature Data Summary Statistics:
              Year        Mean
count   272.000000  272.000000
mean   1947.500000    0.029980
std      39.331123    0.312131
min    1880.000000   -0.480000
25%    1913.750000   -0.202050
50%    1947.500000   -0.057500
75%    1981.250000    0.206900
max    2015.000000    0.899000

Shape of Annual Temperature Data:
(272, 3)


In [4]:
# Overview of the global CO2 data: structure, summary statistics, shape.
print("\nGlobal CO2 Data Info:")
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called directly; wrapping it in print() emits a stray "None" line.
dt.info()
print("\nGlobal CO2 Data Summary Statistics:")
print(dt.describe())
print("\nShape of Global CO2 Data:")
print(dt.shape)


Global CO2 Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260 entries, 0 to 259
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Year         260 non-null    int64  
 1   Total        260 non-null    int64  
 2   Gas Fuel     260 non-null    int64  
 3   Liquid Fuel  260 non-null    int64  
 4   Solid Fuel   260 non-null    int64  
 5   Cement       260 non-null    int64  
 6   Gas Flaring  260 non-null    int64  
 7   Per Capita   61 non-null     float64
dtypes: float64(1), int64(7)
memory usage: 16.4 KB
None

Global CO2 Data Summary Statistics:
              Year        Total    Gas Fuel  Liquid Fuel   Solid Fuel  \
count   260.000000   260.000000   260.00000   260.000000   260.000000   
mean   1880.500000  1402.788462   185.20000   495.819231   674.569231   
std      75.199734  2253.098527   396.58556   934.308074   868.368580   
min    1751.000000     3.000000     0.00000     0.000000     3.00000

In [6]:
# Show the column labels of the Global CO2 dataset.
co2_columns = dt.columns
print("Column Names:", co2_columns)

Column Names: Index(['Year', 'Total', 'Gas Fuel', 'Liquid Fuel', 'Solid Fuel', 'Cement',
       'Gas Flaring', 'Per Capita'],
      dtype='object')


In [7]:
# Show the column labels of the Annual Temperature dataset.
temperature_columns = df.columns
print("Column Names:", temperature_columns)

Column Names: Index(['Source', 'Year', 'Mean'], dtype='object')


**4. Data Cleaning**

1. Handling Missing Values in Datasets


In [8]:
# Count the null entries per column in each dataset.
for title, frame in (
    ("Missing values in Annual Temperature Data:", df),
    ("\nMissing values in Global CO2 Data:", dt),
):
    print(title)
    print(frame.isnull().sum())

Missing values in Annual Temperature Data:
Source    0
Year      0
Mean      0
dtype: int64

Missing values in Global CO2 Data:
Year             0
Total            0
Gas Fuel         0
Liquid Fuel      0
Solid Fuel       0
Cement           0
Gas Flaring      0
Per Capita     199
dtype: int64


In [9]:
# Replace every missing value in the CO2 frame with the mean of its column.
# NOTE(review): the missing-value check reports 199 of 260 'Per Capita'
# entries as null, so most of that column becomes one constant after this
# step -- confirm that is intended before using it as a modelling target.
column_means = dt.mean()
dt.fillna(value=column_means, inplace=True)

# Displayed as the cell result: remaining null count per column.
dt.isnull().sum()

Unnamed: 0,0
Year,0
Total,0
Gas Fuel,0
Liquid Fuel,0
Solid Fuel,0
Cement,0
Gas Flaring,0
Per Capita,0


2. Handling Duplicated Values

In [10]:
# Count fully duplicated rows in each dataset.
for title, frame in (
    ("Duplicates in Annual Temperature Data:", df),
    ("\nDuplicates in Global CO2 Data:", dt),
):
    print(title)
    print(frame.duplicated().sum())

Duplicates in Annual Temperature Data:
0

Duplicates in Global CO2 Data:
0



**5. Standardization**

In [11]:
# Standardize the float/int columns of each dataset with StandardScaler.
scaler = StandardScaler()

# Select the numeric subset once per frame and reuse it for both the
# transform input and the resulting column labels.
temperature_numeric = df.select_dtypes(include=[float, int])
co2_numeric = dt.select_dtypes(include=[float, int])

# The same scaler object is refit for each frame, so after this cell it
# holds the statistics of the CO2 data only.
df_s = pd.DataFrame(
    scaler.fit_transform(temperature_numeric),
    columns=temperature_numeric.columns,
)
dt_s = pd.DataFrame(
    scaler.fit_transform(co2_numeric),
    columns=co2_numeric.columns,
)

In [12]:
# Print each standardized frame under its heading.
for heading, frame in (
    ("Standarized Data of Annual Temperature Dataset:\n", df_s),
    ("\nStandarized Data of Global CO2 Dataset:\n", dt_s),
):
    print(heading, frame)

Standarized Data of Annual Temperature Dataset:
          Year      Mean
0    1.719362  2.789283
1    1.719362  2.696202
2    1.693890  2.279584
3    1.693890  2.311039
4    1.668418  2.050092
..        ...       ...
267 -1.668418 -0.417196
268 -1.693890 -0.323152
269 -1.693890 -0.481390
270 -1.719362 -0.496475
271 -1.719362 -0.738165

[272 rows x 2 columns]

Standarized Data of Global CO2 Dataset:
          Year     Total  Gas Fuel  Liquid Fuel  Solid Fuel    Cement  \
0   -1.725402 -0.622471 -0.467887    -0.531704   -0.774860 -0.433810   
1   -1.712078 -0.622471 -0.467887    -0.531704   -0.774860 -0.433810   
2   -1.698755 -0.622471 -0.467887    -0.531704   -0.774860 -0.433810   
3   -1.685431 -0.622471 -0.467887    -0.531704   -0.774860 -0.433810   
4   -1.672108 -0.622471 -0.467887    -0.531704   -0.774860 -0.433810   
..        ...       ...       ...          ...         ...       ...   
255  1.672108  3.098244  3.384853     2.780862    3.074235  4.086956   
256  1.685431  3.1854

**6. Normalization**

In [13]:
# Rescale the float/int columns of each dataset with MinMaxScaler.
normalizer = MinMaxScaler()

# Select the numeric subset once per frame and reuse it for both the
# transform input and the resulting column labels.
temperature_numeric = df.select_dtypes(include=[float, int])
co2_numeric = dt.select_dtypes(include=[float, int])

# The same normalizer object is refit for each frame, so after this cell it
# holds the min/max of the CO2 data only.
df_n = pd.DataFrame(
    normalizer.fit_transform(temperature_numeric),
    columns=temperature_numeric.columns,
)
dt_n = pd.DataFrame(
    normalizer.fit_transform(co2_numeric),
    columns=co2_numeric.columns,
)

In [14]:
# Print each min-max normalized frame under its heading.
for heading, frame in (
    ("Normalized Data of annual_temperature Dataset:\n", df_n),
    ("\nNormalized  Data of Global CO2 Dataset:\n", dt_n),
):
    print(heading, frame)

Normalized Data of annual_temperature Dataset:
          Year      Mean
0    1.000000  1.000000
1    1.000000  0.978970
2    0.992593  0.884844
3    0.992593  0.891951
4    0.985185  0.832995
..        ...       ...
267  0.014815  0.275562
268  0.007407  0.296809
269  0.007407  0.261059
270  0.000000  0.257650
271  0.000000  0.203046

[272 rows x 2 columns]

Normalized  Data of Global CO2 Dataset:
          Year     Total  Gas Fuel  Liquid Fuel  Solid Fuel    Cement  \
0    0.000000  0.000000  0.000000     0.000000    0.000000  0.000000   
1    0.003861  0.000000  0.000000     0.000000    0.000000  0.000000   
2    0.007722  0.000000  0.000000     0.000000    0.000000  0.000000   
3    0.011583  0.000000  0.000000     0.000000    0.000000  0.000000   
4    0.015444  0.000000  0.000000     0.000000    0.000000  0.000000   
..        ...       ...       ...          ...         ...       ...   
255  0.984556  0.913029  0.896005     0.989430    0.868976  0.791111   
256  0.988417  0.93441

**The Cleaned and Processed Data**

In [15]:
# Preview the first rows of every processing stage for both datasets:
# cleaned, standardized, then normalized.
previews = [
    ("\nCleaned Annual Temperature Data:", df),
    ("\nCleaned Global CO2 Data:", dt),
    ("\nStandardized Annual Temperature Data:", df_s),
    ("\nStandardized Global CO2 Data:", dt_s),
    ("\nNormalized Annual Temperature Data:", df_n),
    ("\nNormalized Global CO2 Data:", dt_n),
]

for caption, frame in previews:
    print(caption)
    print(frame.head())


Cleaned Annual Temperature Data:
    Source  Year    Mean
0     GCAG  2015  0.8990
1  GISTEMP  2015  0.8700
2     GCAG  2014  0.7402
3  GISTEMP  2014  0.7500
4     GCAG  2013  0.6687

Cleaned Global CO2 Data:
   Year  Total  Gas Fuel  Liquid Fuel  Solid Fuel  Cement  Gas Flaring  \
0  1751      3         0            0           3       0            0   
1  1752      3         0            0           3       0            0   
2  1753      3         0            0           3       0            0   
3  1754      3         0            0           3       0            0   
4  1755      3         0            0           3       0            0   

   Per Capita  
0    1.054754  
1    1.054754  
2    1.054754  
3    1.054754  
4    1.054754  

Standardized Annual Temperature Data:
       Year      Mean
0  1.719362  2.789283
1  1.719362  2.696202
2  1.693890  2.279584
3  1.693890  2.311039
4  1.668418  2.050092

Standardized Global CO2 Data:
       Year     Total  Gas Fuel  Liquid Fuel  S

**Model Creation**

*1. Set the target variables*

In [18]:
# CO2 dataset: the year is the single feature, per-capita emissions the target.
# Double brackets keep both selections as one-column DataFrames.
X, y = dt[['Year']], dt[['Per Capita']]

# Temperature dataset: the year is the single feature, the 'Mean' column the target.
Xgg, ygg = df[['Year']], df[['Mean']]

In [19]:
# Print the feature and target frames of both datasets, in order.
for frame in (X, y, Xgg, ygg):
    print(frame)

     Year
0    1751
1    1752
2    1753
3    1754
4    1755
..    ...
255  2006
256  2007
257  2008
258  2009
259  2010

[260 rows x 1 columns]
     Per Capita
0      1.054754
1      1.054754
2      1.054754
3      1.054754
4      1.054754
..          ...
255    1.270000
256    1.280000
257    1.300000
258    1.280000
259    1.330000

[260 rows x 1 columns]
     Year
0    2015
1    2015
2    2014
3    2014
4    2013
..    ...
267  1882
268  1881
269  1881
270  1880
271  1880

[272 rows x 1 columns]
       Mean
0    0.8990
1    0.8700
2    0.7402
3    0.7500
4    0.6687
..      ...
267 -0.1000
268 -0.0707
269 -0.1200
270 -0.1247
271 -0.2000

[272 rows x 1 columns]


*2. Split into training and testing data*

In [20]:
# Split each dataset into train and test parts with identical settings:
# 20% of the rows are held out and the seed is fixed at 42.
split_options = {"test_size": 0.2, "random_state": 42}

# CO2 dataset split.
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Temperature dataset split (variable names kept exactly as other cells may use them).
X_ggtrain, Xggtest, y_ggtrain, y_ggtest = train_test_split(Xgg, ygg, **split_options)
