# EDA and Feature Engineering Steps

## Step-1 Loading different types of files

In [None]:
import pandas as pd

# Catalogue of pandas readers, one per file format.
# Every assignment rebinds `df`, so after this cell runs `df` holds the result
# of the last reader (the SPSS file). Keep only the line matching your data.

# Comma-separated values
df = pd.read_csv("filename.csv")

# Excel workbook
df = pd.read_excel("filename.xlsx")

# JSON document
df = pd.read_json("filename.json")

# HTML page -- the result is stored in `df_list`, not `df`
df_list = pd.read_html("filename.html")

# Apache Parquet
df = pd.read_parquet("filename.parquet")

# Feather
df = pd.read_feather("filename.feather")

# Python pickle -- only unpickle files from a source you trust
df = pd.read_pickle("filename.pkl")

# Stata
df = pd.read_stata("filename.dta")

# SPSS
df = pd.read_spss("filename.sav")

These are some of the methods available for loading data.

## Step-2 Checking/Understanding the data

In [None]:
# A notebook cell renders only the value of its final expression, so every
# intermediate result is passed to display() explicitly. display() is made
# available by the IPython kernel, so no import is required in a notebook.
display(df.head())      # First few rows
display(df.tail())      # Last few rows
df.info()               # Data types and non-null values (prints its own report)
display(df.describe())  # Summary statistics
display(df.shape)       # Dimensions of the dataset
df.dtypes               # Data types (final expression, rendered automatically)

These are some of the steps for inspecting and understanding the data.

## Step-3 Data cleaning

### Handling missing values

In [None]:
# All steps operate on `df`, the frame created by the loading cell.

# Identify missing values: number of nulls in each column
missing_values = df.isnull().sum()
print(missing_values)

# Option 1: drop every row that contains at least one missing value
data_cleaned = df.dropna()

# Option 2: fill missing values with the column mean.
# numeric_only=True restricts the mean to numeric columns; without it,
# pandas >= 2.0 raises a TypeError when the frame has non-numeric columns.
data_filled = df.fillna(df.mean(numeric_only=True))

# Apply the fill to the working frame. Rebinding (instead of inplace=True)
# keeps the cell safe to re-run and avoids mutating the object behind the
# reader's back.
df = df.fillna(df.mean(numeric_only=True))  # For numerical columns
df = df.fillna('Unknown')                   # Whatever is still missing (categorical columns)



### Removing duplicates

In [None]:
# Remove duplicate rows (the first occurrence of each row is kept).
# Uses `df`, the frame created by the loading cell.
data_cleaned = df.drop_duplicates()

### Handling Outliers

In [None]:
import matplotlib.pyplot as plt
import numpy as np  # needed for np.abs below
from scipy import stats

# Visualize potential outliers using a box plot.
# Uses `df`, the frame created by the loading cell.
plt.boxplot(df['numerical_column'])
plt.show()

# Remove outliers with the Z-score method: keep only the rows whose value
# has an absolute z-score below 3.
z_scores = np.abs(stats.zscore(df['numerical_column']))
df = df[z_scores < 3]


These are some of the steps for cleaning the data.

## Step-4 Data Transformation

### Feature Engineering

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, PolynomialFeatures

# Create a new feature (example: total sales = price * quantity).
# Uses `df`, the frame created by the loading cell.
df['total_sales'] = df['price'] * df['quantity']

# Feature matrix for the transformers below. It has to be defined here,
# otherwise the cell fails with a NameError on a fresh kernel.
# 'price' and 'quantity' are example columns -- substitute your own numeric features.
X = df[['price', 'quantity']].to_numpy()

# Polynomial Features: expand X with degree-2 polynomial terms
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)

# Interaction Features: element-wise product of the first two columns.
# validate=True makes the transformer check its input as a 2-D array before
# calling the function; the product itself is 1-D (one value per row).
interaction = FunctionTransformer(lambda arr: arr[:, 0] * arr[:, 1], validate=True)
interaction_pipe = Pipeline([('interaction', interaction)])
X_interaction = interaction_pipe.fit_transform(X)

# Binning: split the column into 5 equal-width bins; labels=False stores the
# integer bin index instead of an interval label.
df['binned_column'] = pd.cut(df['numerical_column'], bins=5, labels=False)


### Scaling and Normalization

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Columns to rescale. Uses `df`, the frame created by the loading cell.
numeric_cols = ['numerical_column1', 'numerical_column2']

# NOTE(review): the scalers are fitted on the full frame here. When a model
# is trained afterwards, fit them on the training split only to avoid leaking
# test-set statistics.

# Standardize numerical features (zero mean, unit variance).
# The result goes into its own frame: writing it back to the same columns
# would be pointless because the Min-Max step below overwrites them.
df_standardized = df.copy()
df_standardized[numeric_cols] = StandardScaler().fit_transform(df[numeric_cols])

# Normalize numerical features with Min-Max scaling (default range [0, 1]).
# This is the version written back to `df`.
scaler = MinMaxScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])


### Encoding Categorical Variables

In [None]:
# One-hot encoding for categorical variables: 'categorical_column' is replaced
# by one indicator column per category.
# Uses `df`, the frame created by the loading cell.
data_encoded = pd.get_dummies(df, columns=['categorical_column'])


These are some of the methods for transforming the data.

## Step-5 Data visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Univariate Analysis

## Histograms
df['numerical_column'].hist(bins=30)
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Histogram of Numerical Column')
plt.show()

## Bar plots for categorical data
df['categorical_column'].value_counts().plot(kind='bar')
plt.xlabel('Category')
plt.ylabel('Frequency')
plt.title('Bar Plot of Categorical Column')
plt.show()

# Bivariate Analysis

## Scatter plots
df.plot.scatter(x='column1', y='column2')
plt.xlabel('Column 1')
plt.ylabel('Column 2')
plt.title('Scatter Plot of Column 1 vs Column 2')
plt.show()

## Box plots
df.boxplot(column='numerical_column', by='categorical_column')
plt.xlabel('Category')
plt.ylabel('Value')
plt.title('Box Plot of Numerical Column by Category')
plt.suptitle('')  # clear the figure-level title so only the title above is shown
plt.show()

# Multivariate Analysis

## Pair plots
sns.pairplot(df[['column1', 'column2', 'column3']])
plt.show()

## Heatmaps for correlation
# numeric_only=True skips non-numeric columns such as 'categorical_column';
# without it, pandas >= 2.0 raises when the frame contains them.
corr = df.corr(numeric_only=True)
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title('Heatmap of Correlation Matrix')
plt.show()

## Distribution Plots
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is its documented replacement.
sns.histplot(df['numerical_column'], kde=True, stat='density')
plt.title('Distribution Plot of Numerical Column')
plt.show()

## KDE (Kernel Density Estimate) Plots
sns.kdeplot(df['numerical_column'])
plt.title('KDE Plot of Numerical Column')
plt.show()

## Violin Plots
sns.violinplot(x='categorical_column', y='numerical_column', data=df)
plt.title('Violin Plot of Numerical Column by Category')
plt.show()

## Facet Grids
# One histogram panel per category
g = sns.FacetGrid(df, col='categorical_column')
g.map(plt.hist, 'numerical_column')
plt.show()




These are some of the methods for visualizing the data.

## Step-6 Splitting the data

In [None]:
from sklearn.model_selection import train_test_split

# Separate the label column from the predictor columns.
y = df['target_column']
X = df.drop(columns=['target_column'])

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


This method splits the data into training and test sets.