# Installing the scikit-learn library for model building

In [1]:
# Since i have already created the sklearn-virtual nvironment i am going t activate it
# ! source sklearn-env/bin/activate

In [2]:
# Installing the scikit-learn library
# ! pip install -U scikit-learn

In [3]:
# Checking my installation
# ! python -m pip show scikit-learn # showing scikit-learn version and location

In [4]:
# showing all the installed packages in the environment
# ! python -m pip freeze

In [5]:
# Command to check and display scikit-learn version and dependencies
# ! python -c "import sklearn; sklearn.show_versions()"

# Data Preprocessing

In [47]:
# Importing necessary libraries

# Pandas a library for data manipulation and analysis, providing data structures like DataFrames
import pandas as pd



In [48]:
# Importing data

traindf = pd.read_csv('train.csv')


# Data exploration

In [49]:
traindf.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


### Data Overview
Here is a brief description of the columns train.csv dataset:

- **ID_code**:This is a unique identifier for each row, which likely has no predictive power and should be removed before training the model.
- **target**: This is the target variable you are trying to predict. It has binary values (0 and 1), indicating a classification problem.
- **var_0** to **var_199**: These are the feature columns that will be used to predict the target. Each row contains 200 features.

In [51]:
traindf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Columns: 202 entries, ID_code to var_199
dtypes: float64(200), int64(1), object(1)
memory usage: 308.2+ MB


Based on the output of the `.info()` method, here is a brief description of the data:

### Data Overview
- **Number of Entries**: 200,000
- **Number of Columns**: 202
  - **ID_code**: Object (likely a unique identifier)
  - **target**: Integer (binary target variable with values 0 and 1)
  - **var_0 to var_199**: Float64 (features)

### Memory Usage
- **Memory Usage**: 308.2+ MB

In [11]:
# Count the instances of 0s and 1s in the target column
value_counts = traindf['target'].value_counts()

# Get the counts for 0 and 1
count_0 = value_counts.get(0, 0)
count_1 = value_counts.get(1, 0)

# Print the counts
print(f"Count of 0s: {count_0}")
print(f"Count of 1s: {count_1}")

Count of 0s: 179902
Count of 1s: 20098


# Dataset Overview

The `.info()` method shows that the dataset has 2000 entries with no missing values and the correct data types for each column. This means we can proceed directly to:

1. Splitting the data into training and test sets
2. Feature scaling


# Splitting into Training and Test Sets

In [12]:
# Importing the train_test_split function from the model_selection module of scikit-learn
from sklearn.model_selection import train_test_split

In [56]:
# The following line of code removes the 'ID_code' column from the DataFrame.

traindf = traindf.drop('ID_code', axis=1)

# Explanation:
# - 'ID_code': the name of the column to be dropped.
# - axis=1: specifies that a column (as opposed to a row) is to be dropped.
# 
# Reasoning:
# The 'ID_code' column typically contains unique identifiers for each row (e.g., IDs).
# Such identifiers are not relevant for the modeling process as they do not provide
# any predictive value. Including them might lead to overfitting or reduce model performance.
# Therefore, it is common practice to remove these fields before training a machine learning model.


In [14]:
# Separating features and target
X = traindf.drop('target', axis = 1)
y = traindf['target']

In [15]:
X


Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,8.9255,-6.7863,11.9081,5.0930,11.4607,-9.2834,5.1187,18.6266,-4.9200,5.7470,...,4.4354,3.9642,3.1364,1.6910,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,11.5006,-4.1473,13.8588,5.3890,12.3622,7.0433,5.6208,16.5338,3.1468,8.0851,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.3560,1.9518
2,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,-4.9193,5.9525,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.9250,-5.8609,8.2450,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,6.2654,7.6784,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,11.4880,-0.4956,8.2622,3.5142,10.3404,11.6081,5.6709,15.1516,-0.6209,5.6669,...,6.1415,13.2305,3.9901,0.9388,18.0249,-1.7939,2.1661,8.5326,16.6660,-17.8661
199996,4.9149,-2.4484,16.7052,6.6345,8.3096,-10.5628,5.8802,21.5940,-3.6797,6.0019,...,4.9611,4.6549,0.6998,1.8341,22.2717,1.7337,-2.1651,6.7419,15.9054,0.3388
199997,11.2232,-5.0518,10.5127,5.6456,9.3410,-5.4086,4.5555,21.5571,0.1202,6.1629,...,4.0651,5.4414,3.1032,4.8793,23.5311,-1.5736,1.2832,8.7155,13.8329,4.1995
199998,9.7148,-8.6098,13.6104,5.7930,12.5173,0.5339,6.0479,17.0152,-2.1926,8.7542,...,2.6840,8.6587,2.7337,11.1178,20.4158,-0.0786,6.7980,10.0342,15.5289,-13.9001


In [16]:
y

0         0
1         0
2         0
3         0
4         0
         ..
199995    0
199996    0
199997    0
199998    0
199999    0
Name: target, Length: 200000, dtype: int64

In [17]:
# splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [18]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 160000 entries, 153248 to 121958
Columns: 200 entries, var_0 to var_199
dtypes: float64(200)
memory usage: 245.4 MB


In [19]:
y_train.info()

<class 'pandas.core.series.Series'>
Index: 160000 entries, 153248 to 121958
Series name: target
Non-Null Count   Dtype
--------------   -----
160000 non-null  int64
dtypes: int64(1)
memory usage: 2.4 MB


In [20]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40000 entries, 119737 to 6584
Columns: 200 entries, var_0 to var_199
dtypes: float64(200)
memory usage: 61.3 MB


In [21]:
y_test.info()

<class 'pandas.core.series.Series'>
Index: 40000 entries, 119737 to 6584
Series name: target
Non-Null Count  Dtype
--------------  -----
40000 non-null  int64
dtypes: int64(1)
memory usage: 625.0 KB


# Handling class imbalance with SMOTE

In [55]:
! pip install imbalance-learn

[31mERROR: Could not find a version that satisfies the requirement imbalance-learn (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for imbalance-learn[0m[31m
[0m

In [53]:
from imblearn.over_sampling import SMOTE

ImportError: cannot import name 'parse_version' from 'sklearn.utils' (/opt/anaconda3/lib/python3.11/site-packages/sklearn/utils/__init__.py)

# Feature scaling
Standardizing the features to have mean = 0 and variance = 1

In [22]:
from sklearn.preprocessing import StandardScaler

In [23]:
# Initialize the scaler
scaler = StandardScaler()

In [24]:
# Fitting and transforming the training data, transform the test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [25]:
X_train_scaled

array([[ 0.53357123, -1.67047693, -0.57711975, ..., -0.80036778,
        -0.98286176,  0.13465476],
       [ 1.55485809,  1.08933341, -0.53895769, ..., -0.47611817,
         2.05104482,  0.461005  ],
       [-0.34230649, -0.74070007, -1.42350658, ...,  0.45688676,
        -0.98505235, -0.58394806],
       ...,
       [ 2.00546841,  0.65097615, -0.18420587, ..., -1.02666926,
        -0.8142192 , -0.70495583],
       [-0.23211415,  2.24198202,  0.82786571, ..., -0.91851366,
         1.82318983,  0.06230493],
       [ 0.30347725, -0.44077922, -0.64825202, ...,  1.79428069,
        -1.15681485,  0.33224683]])

In [26]:
X_test_scaled

array([[ 0.1056708 , -0.71019169, -0.62570343, ..., -0.07976482,
         0.3917361 ,  1.16451898],
       [ 0.71241986, -1.12574794, -1.45424088, ...,  0.90927253,
        -0.34257763, -0.03642883],
       [ 0.82280968,  0.19596898,  1.04277839, ..., -0.2886919 ,
         1.60804693, -0.19947462],
       ...,
       [ 1.19564685,  1.39602292,  0.50483351, ..., -0.07216352,
         1.19080511, -1.26336778],
       [ 1.54172585, -0.00279251,  0.92222871, ..., -1.98084923,
        -1.08186998,  0.41541273],
       [-0.28181261,  0.42489299,  0.87959483, ...,  0.10049451,
        -0.53415505, -1.21478647]])

# Modelling
## 5. Building the model

In [27]:
from sklearn.linear_model import LogisticRegression

In [28]:
# Initializing the model
model = LogisticRegression(max_iter = 1000)

## 6 Training the model

In [29]:
# Training the model
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Make Predictions
Using the trained model to make predictions on the test data

In [30]:
y_pred = model.predict(X_test_scaled)



In [31]:
print(y_pred)

[0 1 0 ... 1 0 1]


# Evaluation
#### 8. Calculating perfomance metrics
    Evaluating the model using accuracy and other relavant metrics

In [1]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

ModuleNotFoundError: No module named 'sklearn'

In [33]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

In [34]:
print(f"Accuracy : {accuracy:.2f}")

Accuracy : 0.52


# Using LinearSVC

## Modelling

## 5. Buiding the model

In [35]:
from sklearn.svm import LinearSVC

In [36]:
# Initializing the model
SVCModel = LinearSVC()

## 6 Training the model
Fitting the model

In [37]:
SVCModel.fit(X_train_scaled, y_train)

## 7. Making Predictions

In [38]:
y_pred = SVCModel.predict(X_test_scaled)

# Evaluating

In [39]:
new_accuracy = accuracy_score(y_test, y_pred)

In [40]:
print(f"Accuracy: {new_accuracy:.2f}")

Accuracy: 0.91


# Using Gaussian Naive Bayes 

### Modelling

### 5. Building the model

In [41]:
from sklearn.naive_bayes import GaussianNB

In [42]:
gaus_model = GaussianNB()

### 6 Training the model

In [43]:
gaus_model.fit(X_train_scaled, y_train)

In [44]:
new_y_pred = gaus_model.predict(X_test_scaled)

In [45]:
gaus_accuracy = accuracy_score(y_test, new_y_pred)

In [46]:
print(f"Accuracy: {gaus_accuracy:.2f}")

Accuracy: 0.92
