<a href="https://colab.research.google.com/github/LAMECH9/LAMECH9/blob/main/Recap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd # Importing pandas for data manipulation
import numpy as np # Importing numpy for numerical computations
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Step 1: Load the dataset (the CSV file is created separately)
df = pd.read_csv('/content/data.csv')  # Parse the CSV into a pandas DataFrame
# Print the heading, then the first five rows, each terminated by a newline
print("Original Data:", df.head(), sep="\n")

Original Data:
   feature_1  feature_2 category_column  missing_values  duplicate_column
0   4.967142  42.923146               B             NaN                 2
1  -1.382643  47.896773               B            86.0                 1
2   6.476885  48.286427               A            57.0                 5
3  15.230299  45.988614               A            29.0                 4
4  -2.341534  49.193571               A            78.0                 1


In [None]:
print(df.tail(n=5))  # Show the last five rows of the dataset

     feature_1  feature_2 category_column  missing_values  duplicate_column
100   4.967142  42.923146               B             NaN                 2
101  -1.382643  47.896773               B            86.0                 1
102   6.476885  48.286427               A            57.0                 5
103  15.230299  45.988614               A            29.0                 4
104  -2.341534  49.193571               A            78.0                 1


In [None]:
print(df.tail(n=15))  # Show the last 15 rows of the dataset

     feature_1  feature_2 category_column  missing_values  duplicate_column
90    0.970775  47.767425               C             NaN                 4
91    9.686450  54.281994               C            36.0                 4
92   -7.020531  51.070469               A            10.0                 5
93   -3.276621  43.771306               B            73.0                 1
94   -3.921082  50.865905               A            24.0                 1
95  -14.635149  51.926587               C            64.0                 3
96    2.961203  45.580713               C            99.0                 5
97    2.610553  50.768626               B            49.0                 4
98    0.051135  50.291044               C            99.0                 1
99   -2.345871  44.285149               A            36.0                 4
100   4.967142  42.923146               B             NaN                 2
101  -1.382643  47.896773               B            86.0                 1
102   6.4768

In [None]:
print(df.head(n=15))  # Show the first 15 rows of the dataset

    feature_1  feature_2 category_column  missing_values  duplicate_column
0    4.967142  42.923146               B             NaN                 2
1   -1.382643  47.896773               B            86.0                 1
2    6.476885  48.286427               A            57.0                 5
3   15.230299  45.988614               A            29.0                 4
4   -2.341534  49.193571               A            78.0                 1
5   -2.341370  52.020254               C            92.0                 5
6   15.792128  59.430930               B            69.0                 1
7    7.674347  50.872889               C            47.0                 3
8   -4.694744  51.287752               A            94.0                 1
9    5.425600  49.627770               B            62.0                 1
10  -4.634177  40.406144               C             NaN                 1
11  -4.657298  49.867431               B            69.0                 4
12   2.419623  50.301151 

In [None]:
print(df.head(n=30))  # Show the first 30 rows of the dataset

    feature_1  feature_2 category_column  missing_values  duplicate_column
0    4.967142  42.923146               B             NaN                 2
1   -1.382643  47.896773               B            86.0                 1
2    6.476885  48.286427               A            57.0                 5
3   15.230299  45.988614               A            29.0                 4
4   -2.341534  49.193571               A            78.0                 1
5   -2.341370  52.020254               C            92.0                 5
6   15.792128  59.430930               B            69.0                 1
7    7.674347  50.872889               C            47.0                 3
8   -4.694744  51.287752               A            94.0                 1
9    5.425600  49.627770               B            62.0                 1
10  -4.634177  40.406144               C             NaN                 1
11  -4.657298  49.867431               B            69.0                 4
12   2.419623  50.301151 

In [None]:
# Step 2: Removing duplicate rows
print("\nChecking for duplicates:")
print(df.duplicated().sum())  # Count rows that exactly repeat an earlier row
# .copy() makes the de-duplicated frame independent of the original, so later
# column assignments do not raise pandas' SettingWithCopyWarning.
df = df.drop_duplicates().copy()

# Step 3: Handling missing values
# Fill gaps in numeric columns with that column's median so that no NaN is
# carried into the scaling and splitting steps. Rows are kept (not dropped),
# so the number of rows is unchanged by this step.
numeric_na_cols = df.select_dtypes(include="number").columns
df[numeric_na_cols] = df[numeric_na_cols].fillna(df[numeric_na_cols].median())


Checking for duplicates:
5


In [None]:
# Step 4: Encoding Categorical Variables
print("\nEncoding categorical variables")
# Work on an independent copy: `df` may be the result of an earlier filtering
# step (e.g. drop_duplicates), and assigning columns on such a frame raises
# pandas' SettingWithCopyWarning.
df = df.copy()
categorical_cols = df.select_dtypes(include=['object']).columns  # Object-dtype (string) columns
label_encoders = {}  # Column name -> fitted LabelEncoder, kept for later reuse
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])  # Replace each category with an integer code
    label_encoders[col] = le

# Show the result once, after every categorical column has been encoded
# (previously this ran inside the loop: once per column, or never if there were none).
print(df.head())


Encoding categorical variables
   feature_1  feature_2  category_column  missing_values  duplicate_column
0   4.967142  42.923146                1             NaN                 2
1  -1.382643  47.896773                1            86.0                 1
2   6.476885  48.286427                0            57.0                 5
3  15.230299  45.988614                0            29.0                 4
4  -2.341534  49.193571                0            78.0                 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = le.fit_transform(df[col]) # Apply Label Encoding


In [None]:
# Step 5: Feature Scaling
print("\nApplying feature scaling")
scaler = StandardScaler()  # Standardizes each column to zero mean and unit variance
numeric_cols = df.select_dtypes(include="number").columns  # Every numeric column in the frame
# NOTE(review): the scaler is fitted on the whole frame before the train/test
# split, and the selection also covers any label-encoded columns and the
# column later used as the target -- confirm that this is intended.
df[numeric_cols] = scaler.fit(df[numeric_cols]).transform(df[numeric_cols])
print(df.head(n=5))  # Display the first rows of the scaled dataset


Applying feature scaling
   feature_1  feature_2  category_column  missing_values  duplicate_column
0   0.664619  -1.515115         0.037823             NaN         -0.761078
1  -0.038089  -0.466809         0.037823        1.171394         -1.440611
2   0.831697  -0.384681        -1.222959        0.190535          1.277523
3   1.800406  -0.868997        -1.222959       -0.756502          0.597989
4  -0.144206  -0.193479        -1.222959        0.900812         -1.440611


In [None]:
# Step 6: Split the dataset into training and testing sets
print("\nsplitting dataset into training and testing sets:")
y = df['feature_1']  # 'feature_1' is assumed to be the target variable
x = df.drop('feature_1', axis=1)  # All remaining columns are used as predictors
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.2
)
print(f"Training set size: {x_train.shape}, Testing set size: {x_test.shape}")

print("\nData Preprocessing Completed!")


splitting dataset into training and testing sets:
Training set size: (80, 4), Testing set size: (20, 4)

Data Preprocessing Completed!
