<a href="https://colab.research.google.com/github/Lucid-Lifo/Data-Analysis-Using-Python/blob/main/Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing Dataset

In [None]:
# Importing necessary packages
import pandas as pd
import numpy as np

# Load the red-wine quality dataset from the UCI repository.
# The file is semicolon-delimited, hence sep=';'. The URL is written without
# the stray trailing space so it does not depend on the HTTP client trimming
# whitespace, and uses https so the download is not sent in plaintext.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv',
    sep=';',
)


In [None]:
# Display the loaded DataFrame (a notebook renders the last expression of a cell).
df

## Exporting Data

In [None]:
# Write the DataFrame to output.csv in the current working directory;
# index=False leaves the row index out of the file.
df.to_csv("output.csv", index=False)

## Basic statistical insights from datasets


In [None]:
# Dimensions of the DataFrame as (rows, columns).
df.shape

In [None]:
# First rows of the DataFrame (head() returns 5 rows by default).
df.head()

In [None]:
# Last rows of the DataFrame (tail() returns 5 rows by default).
df.tail()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Pairwise correlation matrix between columns (Pearson is the pandas default).
df.corr()

In [None]:
# Number of rows for each distinct value of the 'quality' column.
df.groupby('quality').size()

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read.
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the placement dataset CSV from the mounted Drive folder.
# NOTE(review): the path is specific to one user's Drive layout; adjust it before running elsewhere.
dataset1 = pd.read_csv('/content/drive/MyDrive/Lectures/2025-2026 First Half/MCA/Module 4/Placement_Dataset.csv')

In [None]:
# Preview the first 20 rows of the placement data.
dataset1.head(20)

In [None]:
# Dimensions of the placement data as (rows, columns).
dataset1.shape

# Data cleaning and Preprocessing

## Handling Missing Values

In [None]:
# Count the missing (NaN) entries in each column.
dataset1.isnull().sum() # Checking for missing values

#### impute methods
1. Mean
2. Median
3. Mode

In [None]:
# Replace missing salaries with the column median.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on dataset1['salary']: that chained in-place form
# operates on an intermediate object, triggers a FutureWarning in pandas 2.x,
# and leaves dataset1 unchanged once Copy-on-Write is active.
dataset1['salary'] = dataset1['salary'].fillna(dataset1['salary'].median())

In [None]:
# Re-count missing entries per column to verify the imputation.
dataset1.isnull().sum()

In [None]:
# Alternative: fill missing values with the mean value:
# dataset1['salary'] = dataset1['salary'].fillna(dataset1['salary'].mean())

In [None]:
# Alternative: fill missing values with the mode (most frequent value).
# mode() returns a Series, so take its first entry:
# dataset1['salary'] = dataset1['salary'].fillna(dataset1['salary'].mode()[0])

##### Drop method

In [None]:
# Read Placement_Dataset.csv from the mounted Drive into a separate DataFrame
# that the drop-based approach below works on.
salary_dataset = pd.read_csv('/content/drive/MyDrive/Lectures/2025-2026 First Half/MCA/Module 4/Placement_Dataset.csv')

# (rows, columns) before any rows are removed.
salary_dataset.shape

In [None]:
# Count the missing entries in each column before dropping rows.
salary_dataset.isnull().sum()

In [None]:
# how='any' removes every row that has at least one missing value;
# dropna returns a new DataFrame, so the result is bound back to the same name.
salary_dataset = salary_dataset.dropna(how='any')   # drop the rows with missing values

In [None]:
# Re-count missing entries per column; every count should now be zero.
salary_dataset.isnull().sum()

In [None]:
# (rows, columns) after dropping the incomplete rows.
salary_dataset.shape

# Transformation

##### Use of Scikit library

In [None]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np

# Reload the red-wine quality dataset (semicolon-delimited CSV) from the UCI
# repository. The URL is written without the stray trailing space so it does
# not depend on the HTTP client trimming whitespace, and uses https so the
# download is not sent in plaintext.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv',
    sep=';',
)

# Preview the first 10 rows.
df.head(10)


In [None]:
# Correlation coefficient between two individual columns (Pearson is the default method).
df['fixed acidity'].corr(df['volatile acidity'])   # correlation between two features

In [None]:
# Convert the DataFrame to a plain NumPy array (rows = samples, columns = variables).
array = df.values

# Separate the data into input and output components.
# In the wine-quality CSV the target ('quality') is the LAST column and all
# preceding columns are features. Slicing relative to the end avoids the
# hard-coded 0:8 / 8 split, which dropped features and used an input column
# (pH, index 8) as the output.
x = array[:, :-1]   # every column except the last: input features
y = array[:, -1]    # last column: quality score

In [None]:
# Print the full NumPy array obtained from df.values.
print(array)

In [None]:
# Display the input (feature) component.
x

In [None]:
# Display the output (target) component.
y

### Min-Max Scaler

In [None]:
# Rescale every column of x linearly into the [0, 1] interval.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(x)                    # learn each column's minimum and maximum
rescaledX = scaler.transform(x)  # map the values into the target range

# Preview the first five rescaled rows.
rescaledX[:5]

### Standardizing Scaler

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize each column: subtract its mean and divide by its standard
# deviation, giving zero mean and unit variance per feature.
scaler = StandardScaler()
rescaledX = scaler.fit_transform(x)

# Preview the first five standardized rows.
rescaledX[:5]

### Robust Scaling

In [None]:
from sklearn.preprocessing import RobustScaler

# RobustScaler centres on the median and scales by the interquartile range
# (IQR), so extreme values influence the result less than with mean/std scaling.
robust_scaler = RobustScaler()
robust_scaler.fit(x)
transformed = robust_scaler.transform(x)


In [None]:
# Display the robust-scaled array.
transformed

### Normalizing Data

### Unlike StandardScaler or MinMaxScaler which operate on features (columns), Normalizer works on samples (rows).

In [None]:
from sklearn.preprocessing import Normalizer

# Scale each sample (row) to unit length: with the default L2 norm, the sum of
# the squared components of every row becomes 1.
scaler = Normalizer()
normalizedX = scaler.fit_transform(x)

# Preview the first five normalized rows.
normalizedX[:5]

### Binarizing Data

### Convert numerical feature values into binary (0 or 1) values based on a specified threshold.

In [None]:
from sklearn.preprocessing import Binarizer

# Values strictly greater than the threshold (0.0) become 1; all others become 0.
binarizer = Binarizer(threshold=0.0)
binaryX = binarizer.fit_transform(x)

# Preview the first five binarized rows.
binaryX[:5]

### Label Encoding

### Label encoding in scikit-learn is a technique used to convert categorical labels into numerical format, which is necessary for many machine learning algorithms that require numerical input.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Brand names that the encoder should learn.
input_classes = ['Havells', 'Philips', 'Syska', 'Eveready', 'Lloyd']

# Fit the encoder; the learned classes are stored in sorted order and each one
# is assigned its position as the integer code.
label_encoder = LabelEncoder()
label_encoder.fit(input_classes)

In [None]:
# Show the learned mapping: each class name and the integer code assigned to it.
for i, item in enumerate(label_encoder.classes_):
    print(f"{item} --> {i}")

In [None]:
# Encode a few brand names into their integer codes.
# Every label passed to transform() must have been seen during fit().
labels=['Lloyd','Syska','Philips']
label_encoder.transform(labels)

In [None]:
# Round trip: encode the labels, then decode the integer codes back to the original names.
label_encoder.inverse_transform(label_encoder.transform(labels))

### One hot encoding


In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

cat_data = ["apple", "mango", "apple", "berry", "mango", "apple", "berry", "apple"]

# Step 1: map each category name to an integer code and shape the codes as a
# single column, because OneHotEncoder expects 2-D input.
label = LabelEncoder()
int_data = label.fit_transform(cat_data).reshape(-1, 1)
print(int_data)

# Step 2: expand the integer codes into a dense one-hot matrix
# (sparse_output=False returns a NumPy array instead of a sparse matrix).
onehot_data = OneHotEncoder(sparse_output=False).fit_transform(int_data)
print("\nCategorical data encoded into integer values....\n")
print(onehot_data)

In [None]:
#create DataFrame
# Build a small example table with three numeric columns of eight rows each.
df = pd.DataFrame.from_dict(
    {
        'points': [25, 12, 15, 14, 19, 23, 25, 29],
        'assists': [5, 7, 7, 9, 12, 9, 9, 4],
        'rebounds': [11, 8, 10, 6, 6, 5, 9, 12],
    }
)

# Preview the first five rows.
df.head()

In [None]:
#calculate correlation between points and assists
# Correlation coefficient between points and assists (Pearson is the default method).
df['points'].corr(df['assists'])

## Example: Preprocessing Stages

## 1. Import Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, mean_squared_error

## 2. Load a Built-in Dataset (Iris)

In [None]:
# Load the bundled Iris dataset and pull out the feature matrix and label vector.
iris = datasets.load_iris()
X, y = iris.data, iris.target  # features, labels

# Report what the columns and classes are, and how much data there is.
print('Feature names:', iris.feature_names)
print('Target names:', iris.target_names)
print('Shape of X:', X.shape)

## 3. Train-Test Split

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

## 4. Data Preprocessing

In [None]:
# Scaling features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Mean of scaled training data:", X_train_scaled.mean(axis=0))