# Installing the scikit-learn library for model building

In [1]:
# Since i have already created the sklearn-virtual nvironment i am going t activate it
# ! source sklearn-env/bin/activate

In [2]:
# Installing the scikit-learn library
# ! pip install -U scikit-learn

In [3]:
# Checking my installation
# ! python -m pip show scikit-learn # showing scikit-learn version and location

In [4]:
# showing all the installed packages in the environment
# ! python -m pip freeze

In [5]:
# Command to check and display scikit-learn version and dependencies
# ! python -c "import sklearn; sklearn.show_versions()"

# Data Preprocessing

In [6]:
# Importing necessary libraries

# Pandas a library for data manipulation and analysis, providing data structures like DataFrames
import pandas as pd



In [44]:
# Importing data

traindf = pd.read_csv('train.csv')


# Data exploration

In [45]:
traindf.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


### Data Overview
Here is a brief description of the columns train.csv dataset:

- **ID_code**:This is a unique identifier for each row, which likely has no predictive power and should be removed before training the model.
- **target**: This is the target variable you are trying to predict. It has binary values (0 and 1), indicating a classification problem.
- **var_0** to **var_199**: These are the feature columns that will be used to predict the target. Each row contains 200 features.

In [50]:
traindf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Columns: 201 entries, target to var_199
dtypes: float64(200), int64(1)
memory usage: 306.7 MB


Based on the output of the `.info()` method, here is a brief description of the data:

### Data Overview
- **Number of Entries**: 200,000
- **Number of Columns**: 202
  - **ID_code**: Object (likely a unique identifier)
  - **target**: Integer (binary target variable with values 0 and 1)
  - **var_0 to var_199**: Float64 (features)

### Memory Usage
- **Memory Usage**: 308.2+ MB

## Reasoning:
The **ID_code** column typically contains unique identifiers for each row.
Such identifiers are not relevant for the modeling process as they do not provide
any predictive value. Including them might lead to overfitting or reduce model performance.
Therefore, it is common practice to remove these fields before training a machine learning model.


In [48]:
# The following line of code removes the 'ID_code' column from the DataFrame.
traindf = traindf.drop('ID_code', axis=1)

# Dataset Overview

Removed the 'ID_code' column, preparing the data for modeling.

The .info() method shows that the dataset has 2000 entries with no missing values 
and the correct data types for each column. This means we can proceed directly to:

1. Splitting the data into training and test sets
2. Feature scaling



# Splitting into Training and Test Sets

 This separation is essential for supervised learning tasks, where X represents the
 input features that are used to predict the target variable y.


In [11]:
# Importing the train_test_split function from the model_selection module of scikit-learn
from sklearn.model_selection import train_test_split

In [13]:
# Separating features and target
X = traindf.drop('target', axis = 1)
y = traindf['target']

## Explanation:
- traindf: the DataFrame containing both features and the target variable.
- X: features DataFrame, obtained by dropping the 'target' column from traindf.
- y: target DataFrame, containing only the **target** column from traindf.

In [51]:
X


Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,8.9255,-6.7863,11.9081,5.0930,11.4607,-9.2834,5.1187,18.6266,-4.9200,5.7470,...,4.4354,3.9642,3.1364,1.6910,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,11.5006,-4.1473,13.8588,5.3890,12.3622,7.0433,5.6208,16.5338,3.1468,8.0851,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.3560,1.9518
2,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,-4.9193,5.9525,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.9250,-5.8609,8.2450,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,6.2654,7.6784,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,11.4880,-0.4956,8.2622,3.5142,10.3404,11.6081,5.6709,15.1516,-0.6209,5.6669,...,6.1415,13.2305,3.9901,0.9388,18.0249,-1.7939,2.1661,8.5326,16.6660,-17.8661
199996,4.9149,-2.4484,16.7052,6.6345,8.3096,-10.5628,5.8802,21.5940,-3.6797,6.0019,...,4.9611,4.6549,0.6998,1.8341,22.2717,1.7337,-2.1651,6.7419,15.9054,0.3388
199997,11.2232,-5.0518,10.5127,5.6456,9.3410,-5.4086,4.5555,21.5571,0.1202,6.1629,...,4.0651,5.4414,3.1032,4.8793,23.5311,-1.5736,1.2832,8.7155,13.8329,4.1995
199998,9.7148,-8.6098,13.6104,5.7930,12.5173,0.5339,6.0479,17.0152,-2.1926,8.7542,...,2.6840,8.6587,2.7337,11.1178,20.4158,-0.0786,6.7980,10.0342,15.5289,-13.9001


In [52]:
y

0         0
1         0
2         0
3         0
4         0
         ..
199995    0
199996    0
199997    0
199998    0
199999    0
Name: target, Length: 200000, dtype: int64

In [53]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Explanation:
 - X_train: features for the training set (80% of the original data, as test_size is 0.2)
 - X_test: features for the testing set (20% of the original data)
 - y_train: target variable for the training set (corresponding to X_train)
 - y_test: target variable for the testing set (corresponding to X_test)
 Using train_test_split helps to evaluate the performance of a machine learning model by training it on
 the training set and testing it on the testing set. The random_state parameter ensures that the split
 is reproducible, meaning the same data will be used for training and testing each time the code is run.

# Feature scaling
Standardizing the features to have mean = 0 and variance = 1

In [19]:
# Importing the StandardScaler class from the preprocessing module of scikit-learn.
from sklearn.preprocessing import StandardScaler

### Explanation of StandardScaler in scikit-learn:

#### Purpose:
`StandardScaler` is used to standardize features by removing the mean and scaling to unit variance. This transformation is an essential preprocessing step before applying many machine learning algorithms.

#### Why Standardization?
Many machine learning algorithms perform better or converge faster when features are on a similar scale and close to normally distributed. Standardization ensures that:
- Each feature has a mean of 0.
- Each feature has a standard deviation of 1.

#### How it works:
1. **Compute Mean and Standard Deviation:**
   - The `StandardScaler` first computes the mean (average value) and standard deviation (dispersion or spread) for each feature in the training set.
   - Mean and standard deviation are calculated using the formula:
         $$
     \text{mean} = \frac{1}{n} \sum_{i=1}^{n} x_i
     $$
     $$
     \text{standard deviation} = \sqrt{\frac{1}{n} \sum_{i=1}^{n} (x_i - \text{mean})^2}
     $$

   - Here, $( x_i )$ represents each data value, and \( n \) is the number of data values.

2. **Transform Features:**
   - After computing the mean and standard deviation, `StandardScaler` transforms (standardizes) each feature using the formula:
     $$
      $$\$$
     x_{\text{scaled}} = \frac{x - \text{mean}}{\text{standard deviation}}
     $$
     $$
     \
     $$
   - This transformation centers the distribution of each feature at 0 (subtracting the mean) and scales it to have a standard deviation of 1.

#### Benefits:
- **Improved Model Performance:** Standardization can improve the performance of many machine learning algorithms. Algorithms like gradient descent-based optimization methods (e.g., SVM, logistic regression, neural networks) often perform better when features are standardized.
- **Interpretability:** Standardized features are easier to interpret because they are on a common scale.
- 
#### Summary:
`StandardScaler` in scikit-learn is a transformer that standardizes features by removing the mean and scaling to unit variance. It is a critical preprocessing step that can improve the performance and convergence of many machine learning algorithms, especially those that use gradient descent optimization.

This explanation provides a detailed overview of how `'StandardScaler` works and why it is important in the context of machine learning with scikit-learn.
  ## Scikit-learn methods for feature scaling
  The **fit()** method will compute the mean and standard deviation for later scaling.
  The **transform()** method will then use these computed values to standardize the dataset.
  Alternatively, **fit_transform()** can be used to perform both steps in one go.

In [20]:
# Initialize the scaler
scaler = StandardScaler()

In [21]:
# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the training data
X_train_scaled = scaler.fit_transform(X_train)

# Transform the test data using the scaler fitted on the training data
X_test_scaled = scaler.transform(X_test)


In [22]:
X_train_scaled

array([[ 0.37380309,  1.10624484, -1.26396926, ..., -0.58960701,
         0.11491664,  1.06169989],
       [-0.42435356, -0.23186475, -0.04358289, ...,  0.80125821,
        -0.63501785, -1.663496  ],
       [ 0.18377987, -0.87532526,  0.76439419, ..., -0.24552348,
        -0.23781777,  0.9701959 ],
       ...,
       [ 1.0517584 ,  2.3220103 , -0.65021854, ...,  0.19387975,
        -1.64549858,  0.04624676],
       [-0.35899819,  0.65349506, -0.5752832 , ..., -0.79369311,
        -0.97987605, -0.57043386],
       [ 0.45413438,  0.50348344, -0.07450318, ...,  0.54902427,
         0.93324452,  0.44033976]])

In [23]:
X_test_scaled

array([[-0.12276641, -0.44903359, -0.87525164, ..., -1.45149667,
         0.2478484 ,  0.60887859],
       [-0.8419383 ,  1.2219004 , -1.43620701, ...,  0.85211707,
         0.79595925, -0.70265627],
       [ 1.2451315 ,  0.1524444 , -1.95723471, ...,  1.56717752,
         2.57807558,  1.09771009],
       ...,
       [-0.61207786,  0.77976348,  0.71852467, ..., -1.66046262,
        -0.56755498, -1.29925998],
       [ 0.13835952,  1.22663917,  1.42268981, ..., -1.10481055,
        -0.42960506,  0.55767195],
       [ 0.88157169, -0.91185326,  0.94931039, ...,  1.01618842,
         0.55196302, -1.90047622]])

### Decision Point: Choosing the Machine Learning Model

After completing feature scaling, we now have a clear picture of our prepared data. The next step is to decide which type of machine learning model is appropriate based on the nature of our data and the problem we are trying to solve.

Here are some considerations:

#### 1. Nature of Data:
- **Supervised Learning**:
  - If we have labeled data and we are predicting an outcome.
  - **Classification**: Predicting categorical labels (e.g., spam/not spam, disease diagnosis).
    - Typical algorithms include: Logistic Regression, Decision Trees, Random Forest, Support Vector Machines.
  - **Regression**: Predicting continuous values (e.g., house prices, stock prices).
    - Typical algorithms include: Linear Regression, Ridge Regression, Lasso Regression.

- **Unsupervised Learning**:
  - If we do not have labeled outcomes and we are exploring the data.
  - **Clustering**: Grouping similar data points together (e.g., customer segmentation, anomaly detection).
    - Typical algorithms include: K-Means, DBSCAN, Hierarchical Clustering.
  - **Dimensionality Reduction**: Reducing the number of input variables.
    - Typical algorithms include: Principal Component Analysis (PCA), t-Distributed Stochastic Neighbor Embedding (t-SNE).

- **Reinforcement Learning**:
  - If we are dealing with tasks where an agent learns to take actions in an environment to maximize some cumulative reward.
  - Algorithms include: Q-Learning, Deep Q Networks (DQN), Policy Gradient Methods.

#### 2. Problem You Are Trying to Solve:
- **Classification**:
  - Predicting a categorical label or class based on input features.
  - **Decision**: We are going to use classification. We have labeled data and we want to predict categories.

### Scikit-Learn Classification Chart

Based on our decision to use classification with labeled data, let's visualize some of the common classification algorithms available in scikit-learn:

![alt text](ml_map.svg)

# Modelling
### Building the model

# Using LinearSVC

##  Buiding the model

In [32]:
from sklearn.svm import LinearSVC

In [33]:
# Initializing the model
SVCModel = LinearSVC()

## 6 Training the model
Fitting the model

In [34]:
SVCModel.fit(X_train_scaled, y_train)

## 7. Making Predictions

In [35]:
y_pred = SVCModel.predict(X_test_scaled)

# Evaluating

In [36]:
new_accuracy = accuracy_score(y_test, y_pred)

In [37]:
print(f"Accuracy: {new_accuracy:.2f}")

Accuracy: 0.91


# Using Gaussian Naive Bayes 

### Modelling

### 5. Building the model

In [41]:
from sklearn.naive_bayes import GaussianNB

In [42]:
gaus_model = GaussianNB()

### 6 Training the model

In [43]:
gaus_model.fit(X_train_scaled, y_train)

In [44]:
new_y_pred = gaus_model.predict(X_test_scaled)

In [45]:
gaus_accuracy = accuracy_score(y_test, new_y_pred)

In [46]:
print(f"Accuracy: {gaus_accuracy:.2f}")

Accuracy: 0.92


# Using SGD Classifier


In [56]:
from sklearn.linear_model import SGDClassifier

In [84]:
sgd_classifier = SGDClassifier(loss='hinge', penalty='l2', max_iter=100000)

### Training the model

In [85]:
sgd_classifier.fit(X_train_scaled, y_train)

In [86]:
sgd_y_pred = sgd_classifier.predict(X_test_scaled)

In [87]:
sgd_accuracy = accuracy_score(y_test, sgd_y_pred)

In [95]:
print(f"Accuracy: {sgd_accuracy:.2f}")

Accuracy: 0.90


In [101]:
score = sgd_classifier.score(X_test_scaled, y_test)

In [102]:
from sklearn.metrics import mean_absolute_error

In [103]:
mae = mean_absolute_error(y_test, sgd_y_pred)

In [108]:
print(f"Accuracy score is : {score}")
print(f"Mean absolute error is : {mae}")

Accuracy score is : 0.897575
Mean absolute error is : 0.102425


# Using Kernel-approximation

In [62]:
from sklearn.kernel_approximation import RBFSampler

In [63]:
kapprox = RBFSampler()

# Training the model

In [64]:
kapprox.fit(X_train_scaled, y_train)

In [67]:
kapprox_y_predict = kapprox.predict(X_test_scaled)

AttributeError: 'RBFSampler' object has no attribute 'predict'