In [None]:
# 0. Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split


In [None]:
# 1. Load the raw dataset
# Reads "data.csv" (resolved against the kernel's working directory) into `df`
# and shows the first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer="data.csv")
df.head(n=5)


In [None]:
# 2. Check column types
# Print the dtype pandas inferred for every column so that mis-typed columns
# can be spotted before cleaning.
column_dtypes = df.dtypes
print(column_dtypes)


In [None]:
# 3. Handle types (example: convert a column to datetime)
# errors="coerce" turns values that cannot be parsed into NaT instead of
# raising, so this step can introduce new missing values in "date".
parsed_dates = pd.to_datetime(df["date"], errors="coerce")
df["date"] = parsed_dates


In [None]:
# 4. Check nulls
# Print the number of missing values found in each column.
missing_per_column = df.isna().sum()
print(missing_per_column)


In [None]:
# 5. Handle nulls (impute with the column mean)
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on df["column_name"]: that selection is an intermediate
# object, so the in-place form emits a FutureWarning on pandas 2.x and leaves
# `df` unchanged once Copy-on-Write is the default behaviour.
column_mean = df["column_name"].mean()
df["column_name"] = df["column_name"].fillna(column_mean)


In [None]:
# 6. Check outliers (boxplot)
# Draw a horizontal boxplot of the column to eyeball extreme values before
# filtering them.
sns.boxplot(data=df, x="column_name")
plt.show()


In [None]:
# 7. Handle outliers (IQR method)
# Keep only the rows whose value lies within 1.5 * IQR of the quartiles
# (bounds inclusive). Rows where the column is NaN fail the range test and are
# dropped as well.
Q1 = df["column_name"].quantile(0.25)
Q3 = df["column_name"].quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
within_fences = df["column_name"].between(lower_fence, upper_fence)
df = df[within_fences]


In [None]:
# 8. Check duplicates
# Print how many rows are exact repeats of an earlier row (all columns equal).
duplicate_row_count = df.duplicated().sum()
print(duplicate_row_count)


In [None]:
# 9. Handle duplicates
# Keep the first occurrence of every fully identical row and discard the
# repeats; the original index labels of the surviving rows are preserved.
repeated_rows = df.duplicated()
df = df[~repeated_rows]


In [None]:
# 10. Data visualization (univariate)
# Histogram of a single column, using pandas' default of 10 bins.
feature_values = df["column_name"]
feature_values.hist(bins=10)
plt.show()


In [None]:
# 11. Data visualization (bivariate)
# Scatter plot of col2 against col1 to inspect how the two columns relate.
sns.scatterplot(data=df, x="col1", y="col2")
plt.show()


In [None]:
# 12. Data visualization (multivariate)
# Annotated heatmap of the pairwise correlations between columns.
# numeric_only=True restricts the matrix to numeric columns: pandas >= 2.0 no
# longer drops non-numeric columns silently, so a bare df.corr() raises while
# `df` still holds the datetime "date" column and the not-yet-encoded
# "categorical_column".
correlation_matrix = df.corr(numeric_only=True)
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
plt.show()


In [None]:
# 13. Split into features (X) and label (y)
# y is the "target" column; X is every other column of `df` as it stands now.
# NOTE(review): the one-hot encoding and the "ID" drop are applied to `df` in
# later cells, so neither change reaches X - confirm the intended order.
y = df["target"]
X = df.drop(columns="target")


In [None]:
# 14. Normalization (min-max scaling)
# Rescale every feature in X to the scaler's default [0, 1] range.
# NOTE(review): the scaler is fitted on all of X; if a train/test split follows,
# fit on the training part only to avoid leaking test statistics - confirm.
# NOTE(review): the standardization cell below reuses the names `scaler` and
# `X_scaled`, so running both keeps only the later result.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X)
X_scaled = scaler.transform(X)


In [None]:
# 15. Standardization (z-score scaling)
# Centre every feature in X on zero and scale it to unit variance.
# NOTE(review): this rebinds `scaler` and `X_scaled`, replacing whatever the
# min-max cell above produced - presumably only one of the two is meant to run.
# NOTE(review): fitted on all of X; fit on the training part only if a
# train/test split is added - confirm.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)


In [None]:
# 16. Data encoding (one-hot)
# Replace the categorical column with indicator columns; drop_first=True omits
# the first category's indicator so the remaining ones are not redundant.
# NOTE(review): X and y were built from `df` in an earlier cell, so this
# encoding is not reflected in them - confirm the intended order.
categorical_features = ["categorical_column"]
df = pd.get_dummies(data=df, columns=categorical_features, drop_first=True)


In [None]:
# 17. Save dataset
# Write the cleaned frame to "clean_data.csv" without the index column.
# NOTE(review): the "ID" column is dropped only in the cell after this one, so
# the saved file still contains it - confirm whether that is intended.
df.to_csv(path_or_buf="clean_data.csv", index=False)


In [None]:
# 18. Drop unimportant features
# Remove the "ID" column from `df`.
# NOTE(review): this runs after X was built and after the CSV was written, so it
# affects neither of them - presumably it belongs earlier in the notebook.
df = df.drop(columns=["ID"])
