In [1]:
import os

import pandas as pd

# Location of the raw turbine dataset; the reload cell further down reuses this path.
file_path = 'Dataset/A1-turbine.csv'

# Parse the CSV with pandas' default settings.
turbine_data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows to sanity-check the parse.
turbine_data.head(n=5)

Unnamed: 0,height,fall,net_fall,flow,power
0,624.0,89.16,89.765,3.5,2512.85
1,628.0,93.16,93.765,3.5,2583.79
2,602.0,67.84,66.415,6.5,3748.77
3,599.0,64.84,63.415,6.5,3520.65
4,630.0,94.69,93.54,8.0,6673.84


In [2]:

# Re-read the turbine CSV, this time spelling out the parsing options:
# comma delimiter, first row as the header, and '#' starting a comment.
turbine_data = pd.read_csv(
    file_path,
    comment='#',
    header=0,
    sep=',',
)

# Preview the first five rows of the reloaded frame.
turbine_data.head(n=5)


Unnamed: 0,height,fall,net_fall,flow,power
0,624.0,89.16,89.765,3.5,2512.85
1,628.0,93.16,93.765,3.5,2583.79
2,602.0,67.84,66.415,6.5,3748.77
3,599.0,64.84,63.415,6.5,3520.65
4,630.0,94.69,93.54,8.0,6673.84


In [3]:
# Print the per-column dtype / non-null report.
turbine_data.info()

# Then show the first five rows as the cell's output.
turbine_data.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 451 entries, 0 to 450
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   height    451 non-null    float64
 1   fall      451 non-null    float64
 2   net_fall  451 non-null    float64
 3   flow      451 non-null    float64
 4   power     451 non-null    float64
dtypes: float64(5)
memory usage: 17.7 KB


Unnamed: 0,height,fall,net_fall,flow,power
0,624.0,89.16,89.765,3.5,2512.85
1,628.0,93.16,93.765,3.5,2583.79
2,602.0,67.84,66.415,6.5,3748.77
3,599.0,64.84,63.415,6.5,3520.65
4,630.0,94.69,93.54,8.0,6673.84


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for every turbine column.
turbine_data.describe()

Unnamed: 0,height,fall,net_fall,flow,power
count,451.0,451.0,451.0,451.0,451.0
mean,611.873614,77.147605,76.487583,5.921286,3868.492262
std,11.088312,10.745153,11.070332,1.782916,1399.998523
min,591.0,56.79,55.14,3.0,1675.16
25%,603.0,68.865,67.4775,4.5,2710.67
50%,612.0,76.84,76.415,6.0,3689.14
75%,622.0,86.665,86.1775,7.5,4901.16
max,630.0,96.21,95.935,9.0,7261.37


In [5]:
# Count the missing entries in each column (`isna` is the alias of `isnull`).
turbine_data.isna().sum()

height      0
fall        0
net_fall    0
flow        0
power       0
dtype: int64

In [6]:
from sklearn.preprocessing import StandardScaler

# Standardize every column of the turbine frame: learn the per-column
# statistics first, then apply them to the same data.
scaler = StandardScaler()
scaler.fit(turbine_data)
normalized_turbine_data = scaler.transform(turbine_data)

# Wrap the scaled array in a DataFrame that carries the original column labels.
normalized_turbine_df = pd.DataFrame(
    data=normalized_turbine_data,
    columns=turbine_data.columns,
)

# Preview the first five standardized rows.
normalized_turbine_df.head(n=5)

Unnamed: 0,height,fall,net_fall,flow,power
0,1.094833,1.119178,1.200701,-1.359556,-0.969392
1,1.455974,1.491852,1.562429,-1.359556,-0.918664
2,-0.891441,-0.867176,-0.910882,0.324949,-0.085611
3,-1.162297,-1.146682,-1.182178,0.324949,-0.248735
4,1.636544,1.6344,1.542081,1.167201,2.006047


In [9]:
# Rebuild the DataFrame from the scaled array so this cell does not rely on
# `normalized_turbine_df` still being present in the kernel namespace.
normalized_turbine_df = pd.DataFrame(normalized_turbine_data, columns=turbine_data.columns)

# Write the scaled data to disk. `to_csv` does not create missing folders,
# so make sure the output directory exists before saving.
normalized_turbine_file_path = 'Preprocessed/Preprocessed_A1_turbine.csv'
os.makedirs(os.path.dirname(normalized_turbine_file_path), exist_ok=True)
normalized_turbine_df.to_csv(normalized_turbine_file_path, index=False)
print("Normalized turbine dataset saved to:", normalized_turbine_file_path)


Normalized turbine dataset saved to: Preprocessed/Preprocessed_A1_turbine.csv


In [None]:
# Now working with the synthetic dataset (A1-synthetic.csv)

In [10]:
# Path to the synthetic dataset (a CSV file).
synthetic_file_path = 'Dataset/A1-synthetic.csv'

# Read it as comma-separated text whose first row holds the column names.
synthetic_data = pd.read_csv(
    synthetic_file_path,
    header=0,
    sep=',',
)

# Preview the first five rows.
synthetic_data.head(n=5)

Unnamed: 0,v1,v2,v3,v4,v5,v6,v7,v8,v9,z
0,37.34411,10.542156,0.969185,3.568534,96.798733,3.429026,75.810196,0,20.002459,11.805369
1,4.089849,11.894301,0.467775,1.279044,100.149383,3.190073,76.423095,0,12.702628,5.125025
2,-32.333439,10.968631,0.238486,1.410745,100.642075,3.093934,78.758727,1,10.723848,3.218553
3,-45.632977,11.509606,0.924938,3.404069,105.963016,2.884269,83.02775,0,19.946593,12.955092
4,-41.543394,10.117186,0.31518,1.02012,97.371423,2.81582,77.194463,0,11.105024,1.919094


In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for each synthetic column.
synthetic_data.describe()

Unnamed: 0,v1,v2,v3,v4,v5,v6,v7,v8,v9,z
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.968392,11.00537,0.511468,2.417192,100.108795,2.726705,78.148504,0.313,13.8683,6.367773
std,28.418028,0.58409,0.283292,1.147284,4.893623,0.275193,5.016537,0.463946,3.253272,3.841009
min,-49.94291,10.000103,0.000147,1.002395,85.147019,1.974029,62.554174,0.0,10.1,-0.791114
25%,-23.938244,10.510135,0.267591,1.258367,96.934414,2.540117,74.814175,0.0,10.861926,3.124304
50%,2.852507,10.990934,0.515436,3.042135,99.920497,2.710712,78.101481,0.0,13.089869,5.665139
75%,25.494062,11.517569,0.757415,3.498211,103.482385,2.902578,81.604561,1.0,16.366096,9.260205
max,49.889593,11.999189,0.999727,3.999776,116.239538,3.600914,94.50217,1.0,22.033951,16.08023


In [12]:
# Print the per-column dtype and non-null count report for the synthetic frame.
synthetic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   v1      1000 non-null   float64
 1   v2      1000 non-null   float64
 2   v3      1000 non-null   float64
 3   v4      1000 non-null   float64
 4   v5      1000 non-null   float64
 5   v6      1000 non-null   float64
 6   v7      1000 non-null   float64
 7   v8      1000 non-null   int64  
 8   v9      1000 non-null   float64
 9   z       1000 non-null   float64
dtypes: float64(9), int64(1)
memory usage: 78.3 KB


In [13]:
# Count the missing entries in each column (`isna` is the alias of `isnull`).
synthetic_data.isna().sum()

v1    0
v2    0
v3    0
v4    0
v5    0
v6    0
v7    0
v8    0
v9    0
z     0
dtype: int64

In [14]:
# Standardize every column of the synthetic frame with its own scaler:
# learn the per-column statistics first, then apply them to the same data.
scaler_synthetic = StandardScaler()
scaler_synthetic.fit(synthetic_data)
normalized_synthetic_data = scaler_synthetic.transform(synthetic_data)

# Wrap the scaled array in a DataFrame that carries the original column labels.
normalized_synthetic_df = pd.DataFrame(
    data=normalized_synthetic_data,
    columns=synthetic_data.columns,
)

# Preview the first five standardized rows.
normalized_synthetic_df.head(n=5)

Unnamed: 0,v1,v2,v3,v4,v5,v6,v7,v8,v9,z
0,1.280663,-0.793451,1.616521,1.004039,-0.676741,2.553385,-0.466353,-0.674985,1.886479,1.416377
1,0.109896,1.522669,-0.15431,-0.992534,0.008298,1.684637,-0.344116,-0.674985,-0.358487,-0.323709
2,-1.172442,-0.062932,-0.964088,-0.877683,0.109029,1.335112,0.121703,1.481516,-0.967034,-0.820304
3,-1.640673,0.863717,1.460254,0.860615,1.196895,0.572846,0.973119,-0.674985,1.869298,1.715855
4,-1.496693,-1.521392,-0.693229,-1.218331,-0.559655,0.32399,-0.190274,-0.674985,-0.849809,-1.158785


In [15]:
# Rebuild the DataFrame from the scaled array so this cell does not rely on
# `normalized_synthetic_df` still being present in the kernel namespace.
normalized_synthetic_df = pd.DataFrame(normalized_synthetic_data, columns=synthetic_data.columns)

# Write the scaled data to disk. `to_csv` does not create missing folders,
# so make sure the output directory exists before saving.
normalized_synthetic_file_path = 'Preprocessed/Preprocessed_A1_synthetic.csv'
os.makedirs(os.path.dirname(normalized_synthetic_file_path), exist_ok=True)
normalized_synthetic_df.to_csv(normalized_synthetic_file_path, index=False)

# Report where the file was written.
print("Normalized synthetic dataset saved to:", normalized_synthetic_file_path)

Normalized synthetic dataset saved to: Preprocessed/Preprocessed_A1_synthetic.csv


In [16]:
# Working with the third dataset (advertising.csv)

In [17]:
import pandas as pd

# Load the advertising dataset (CSV).
# A dedicated path variable is used instead of rebinding `file_path`, which
# holds the turbine CSV path read by the turbine reload cell; overwriting it
# would make that cell load the wrong file if it were re-run afterwards.
advertising_file_path = 'Dataset/advertising.csv'
advertising_data = pd.read_csv(advertising_file_path, sep=',')

# Display the first few rows of the dataset
advertising_data.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad
0,68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,2016-03-27 00:53:11,0
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,2016-04-04 01:39:02,0
2,69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,2016-03-13 20:35:42,0
3,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,2016-01-10 02:31:19,0
4,68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,2016-06-03 03:36:18,0


In [18]:
# Display the advertising frame as loaded (the notebook renders a truncated view).
advertising_data

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad
0,68.95,35,61833.90,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,2016-03-27 00:53:11,0
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,2016-04-04 01:39:02,0
2,69.47,26,59785.94,236.50,Organic bottom-line service-desk,Davidton,0,San Marino,2016-03-13 20:35:42,0
3,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,2016-01-10 02:31:19,0
4,68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,2016-06-03 03:36:18,0
...,...,...,...,...,...,...,...,...,...,...
995,72.97,30,71384.57,208.58,Fundamental modular algorithm,Duffystad,1,Lebanon,2016-02-11 21:49:00,1
996,51.30,45,67782.17,134.42,Grass-roots cohesive monitoring,New Darlene,1,Bosnia and Herzegovina,2016-04-22 02:07:01,1
997,51.63,51,42415.72,120.37,Expanded intangible solution,South Jessica,1,Mongolia,2016-02-01 17:24:57,1
998,55.55,19,41920.79,187.95,Proactive bandwidth-monitored policy,West Steven,0,Guatemala,2016-03-24 02:35:54,0


In [19]:
# `DataFrame.info()` prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line after the report.
advertising_data.info()

# Count the missing values in each column.
print(advertising_data.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Daily Time Spent on Site  1000 non-null   float64
 1   Age                       1000 non-null   int64  
 2   Area Income               1000 non-null   float64
 3   Daily Internet Usage      1000 non-null   float64
 4   Ad Topic Line             1000 non-null   object 
 5   City                      1000 non-null   object 
 6   Male                      1000 non-null   int64  
 7   Country                   1000 non-null   object 
 8   Timestamp                 1000 non-null   object 
 9   Clicked on Ad             1000 non-null   int64  
dtypes: float64(3), int64(3), object(4)
memory usage: 78.3+ KB
None
Daily Time Spent on Site    0
Age                         0
Area Income                 0
Daily Internet Usage        0
Ad Topic Line               0
City            

In [20]:
# Drop the columns this notebook does not use for model training
# (Ad Topic Line, City, Country, Timestamp), on the stated rationale that
# they are not directly related to predicting the target.
# errors="ignore" keeps the cell re-runnable: on a second run the columns are
# already gone, and the default errors="raise" would fail with a KeyError.
advertising_data = advertising_data.drop(
    columns=["Ad Topic Line", "City", "Country", "Timestamp"],
    errors="ignore",
)

In [21]:
# Display the advertising frame again to check which columns remain.
advertising_data

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Clicked on Ad
0,68.95,35,61833.90,256.09,0,0
1,80.23,31,68441.85,193.77,1,0
2,69.47,26,59785.94,236.50,0,0
3,74.15,29,54806.18,245.89,1,0
4,68.37,35,73889.99,225.58,0,0
...,...,...,...,...,...,...
995,72.97,30,71384.57,208.58,1,1
996,51.30,45,67782.17,134.42,1,1
997,51.63,51,42415.72,120.37,1,1
998,55.55,19,41920.79,187.95,0,0


In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [23]:
# Fit a label encoder on the "Male" column, then replace the column with its
# integer-encoded values.
label_encoder = LabelEncoder().fit(advertising_data["Male"])
advertising_data["Male"] = label_encoder.transform(advertising_data["Male"])

In [None]:
# Columns to rescale; "Male" and "Clicked on Ad" are not in this list and stay untouched.
numerical_features = [
    "Daily Time Spent on Site",
    "Age",
    "Area Income",
    "Daily Internet Usage",
]

# Standardize those columns and write the result back into the frame.
# Note: this rebinds `scaler`, which earlier held the scaler fitted on the turbine data.
scaler = StandardScaler().fit(advertising_data[numerical_features])
advertising_data[numerical_features] = scaler.transform(advertising_data[numerical_features])

In [25]:
# Display the advertising frame to inspect the rescaled numeric columns.
advertising_data

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Clicked on Ad
0,0.249267,-0.114905,0.509691,1.734030,0,0
1,0.961132,-0.570425,1.002530,0.313805,1,0
2,0.282083,-1.139826,0.356949,1.287589,0,0
3,0.577432,-0.798185,-0.014456,1.501580,1,0
4,0.212664,-0.114905,1.408868,1.038731,0,0
...,...,...,...,...,...,...
995,0.502963,-0.684305,1.222006,0.651314,1,1
996,-0.864601,1.023896,0.953329,-1.038735,1,1
997,-0.843775,1.707176,-0.938570,-1.358924,1,1
998,-0.596389,-1.936986,-0.975484,0.181172,0,0


In [29]:
from sklearn.preprocessing import MinMaxScaler
# Min-max normalization of the numeric columns, written back into the frame.
# NOTE(review): these same columns are also rescaled by the StandardScaler cell;
# min-max scaling gives the same result with or without that earlier linear
# rescaling, so one of the two steps looks redundant. Confirm the intent.
normalizer = MinMaxScaler().fit(advertising_data[numerical_features])
advertising_data[numerical_features] = normalizer.transform(advertising_data[numerical_features])

In [30]:
# Display the advertising frame to inspect the min-max normalized columns.
advertising_data

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Clicked on Ad
0,0.617882,0.380952,0.730472,0.916031,0,0
1,0.809621,0.285714,0.831375,0.538746,1,0
2,0.626721,0.166667,0.699200,0.797433,0,0
3,0.706272,0.238095,0.623160,0.854280,1,0
4,0.608023,0.380952,0.914568,0.731323,0,0
...,...,...,...,...,...,...
995,0.686215,0.261905,0.876310,0.628405,1,1
996,0.317865,0.619048,0.821302,0.179441,1,1
997,0.323474,0.761905,0.433959,0.094382,1,1
998,0.390107,0.000000,0.426401,0.503511,0,0


In [31]:
# Export the preprocessed advertising data to CSV.
# `to_csv` does not create missing folders, so make sure the output directory exists.
preprocessed_advertising_file_path = "Preprocessed/preprocessed_Adversting_data.csv"
os.makedirs(os.path.dirname(preprocessed_advertising_file_path), exist_ok=True)
advertising_data.to_csv(preprocessed_advertising_file_path, index=False)

# Report the path that was actually written; the previous message named
# 'preprocessed_data.csv', which is not the file this cell creates.
print(f"Preprocessed data exported successfully to '{preprocessed_advertising_file_path}'")


Preprocessed data exported successfully to 'preprocessed_data.csv'


In [None]:
# Preprocessing of all three datasets is complete.
# We can now proceed with model building and training.