# Manual Feature Extraction techniques

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


In [13]:
# Load the wine-quality CSV into `data` and take a first look at it.
from pathlib import Path

# Build the location from the current user's home directory rather than
# hard-coding one machine's absolute path.
# NOTE(review): assumes the file still lives in ~/Downloads — adjust if it moved.
WINE_CSV_PATH = Path.home() / 'Downloads' / 'WineQT.csv'

data = pd.read_csv(WINE_CSV_PATH)
print(data.head())  # Display the first few rows of the dataset
# info() prints its own report and returns None, so wrapping it in print()
# would append a stray "None" line to the output.
data.info()  # Check data types and missing values


   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  Id  
0      9.4        5   0  
1      9.8        5   1  
2      9

In [14]:
from sklearn.datasets import load_iris

# Fetch the Iris dataset object; its feature matrix, column names and labels
# are read from it below.
data = load_iris()

# One frame holding the measurements plus a 'target' column with the labels
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)

# Preview the first rows
print(df.head())


   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  


In [15]:
# Hand-crafted features: length-to-width ratio for sepals and for petals.
# A small constant is added to each width before dividing.
_RATIO_EPS = 1e-5
sepal_width_shifted = df['sepal width (cm)'] + _RATIO_EPS
petal_width_shifted = df['petal width (cm)'] + _RATIO_EPS

df['sepal_length_sepal_width_ratio'] = df['sepal length (cm)'].div(sepal_width_shifted)
df['petal_length_petal_width_ratio'] = df['petal length (cm)'].div(petal_width_shifted)

# Preview the frame with the two new columns appended
print(df.head())


   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  sepal_length_sepal_width_ratio  petal_length_petal_width_ratio  
0       0                        1.457139                        6.999650  
1       0                        1.633328                        6.999650  
2       0                        1.468745                        6.499675  
3       0                        1.483866                        7.499625  
4       0                        1.388885                        6.999650  


In [16]:
from sklearn.model_selection import train_test_split

# Labels come from the 'target' column; every other column is a feature
y = df['target']
X = df.drop(columns=['target'])

# Hold out 20% of the rows for testing, with a fixed seed for the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Create the seeded Random Forest and fit it on the training split in one step
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out rows and compare against the true labels
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")


Accuracy: 1.0


# Automatic Feature Extraction techniques

### 1. Principal Component Analysis (PCA)

In [18]:
# Libraries used by the PCA demonstration
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.datasets import load_iris

# Rebuild the Iris frame from scratch (swap in your own loader here)
data = load_iris()  # Replace with your own dataset loading code
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)

# Labels vs. feature matrix
y = df['target']
X = df.drop(columns=['target'])

# Put all features on a common scale before running PCA
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Project onto two principal components (handy for 2-D visualisation)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Report the shapes before and after the projection
print("Original shape of data:", X.shape)
print("Transformed shape of data:", X_pca.shape)

# Collect the components in a frame and attach the labels
pca_df = pd.DataFrame(X_pca, columns=['PC1', 'PC2']).assign(target=y)

# Preview the projected data
print(pca_df.head())


Original shape of data: (150, 4)
Transformed shape of data: (150, 2)
        PC1       PC2  target
0 -2.264703  0.480027       0
1 -2.080961 -0.674134       0
2 -2.364229 -0.341908       0
3 -2.299384 -0.597395       0
4 -2.389842  0.646835       0


In [19]:
# Recursive Feature Elimination (RFE) with a Random Forest on Iris.
# NOTE(review): this cell sits under the PCA heading but performs RFE, and the
# same steps appear again under the "Recursive Feature Elimination (RFE)"
# heading — consider keeping only one copy.
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

# Load your dataset (use your own dataset here, e.g., load_iris)
data = load_iris()  # Replace with your own dataset loading code
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

# Separate features and target
X = df.drop(columns=['target'])
y = df['target']

# Seed the forest so the features chosen by RFE are the same on every run;
# 42 matches the seed used by the other stochastic steps in this notebook.
model = RandomForestClassifier(random_state=42)

# Apply RFE to select the top 2 features (you can change the number of features to select)
rfe = RFE(model, n_features_to_select=2)
X_rfe = rfe.fit_transform(X, y)

# Use the selector's support mask to look up the names of the kept columns
selected_features = X.columns[rfe.support_]
print("Selected Features by RFE:", selected_features)


Selected Features by RFE: Index(['petal length (cm)', 'petal width (cm)'], dtype='object')


### 2. Recursive Feature Elimination (RFE)

In [20]:
# Recursive Feature Elimination (RFE): keep the two Iris features that a
# Random Forest ranks highest.
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

# Load your dataset (use your own dataset here, e.g., load_iris)
data = load_iris()  # Replace with your own dataset loading code
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

# Separate features and target
X = df.drop(columns=['target'])
y = df['target']

# A fixed random_state makes the forest, and therefore the RFE selection,
# reproducible on a fresh kernel; 42 matches the seed used earlier in the notebook.
model = RandomForestClassifier(random_state=42)

# Apply RFE to select the top 2 features (you can change the number of features to select)
rfe = RFE(model, n_features_to_select=2)
X_rfe = rfe.fit_transform(X, y)

# Use the selector's support mask to look up the names of the kept columns
selected_features = X.columns[rfe.support_]
print("Selected Features by RFE:", selected_features)


Selected Features by RFE: Index(['petal length (cm)', 'petal width (cm)'], dtype='object')
