# Data Science Intro to Machine Learning Project

In [1]:
import os

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Location of the dataset. The default is the original author's local path;
# set the USERS_BEHAVIOR_CSV environment variable to run the notebook on
# another machine without editing this cell.
file_path = os.environ.get(
    "USERS_BEHAVIOR_CSV", r"C:/Users/kevin/datasets/users_behavior.csv"
)
df = pd.read_csv(file_path)
# Preview the first 20 rows to sanity-check the load.
print(df.head(20))

    calls  minutes  messages   mb_used  is_ultra
0    40.0   311.90      83.0  19915.42         0
1    85.0   516.75      56.0  22696.96         0
2    77.0   467.66      86.0  21060.45         0
3   106.0   745.53      81.0   8437.39         1
4    66.0   418.74       1.0  14502.75         0
5    58.0   344.56      21.0  15823.37         0
6    57.0   431.64      20.0   3738.90         1
7    15.0   132.40       6.0  21911.60         0
8     7.0    43.39       3.0   2538.67         1
9    90.0   665.41      38.0  17358.61         0
10   82.0   560.51      20.0   9619.53         1
11   45.0   344.32      13.0  19898.81         0
12   51.0   437.13      61.0  21523.58         0
13   56.0   433.07      16.0  16702.36         0
14  108.0   587.90       0.0  14406.50         1
15    6.0    22.13       0.0   2710.09         0
16    2.0    18.73       0.0    588.89         0
17   26.0   163.62       4.0  16870.34         0
18   79.0   532.62      90.0  19908.31         0
19   49.0   341.67  

In [3]:
# (rows, columns) of the loaded dataset.
df.shape

(3214, 5)

## Preprocessing 

In [4]:
# Column dtypes and non-null counts, to spot missing values or wrong types.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3214 entries, 0 to 3213
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   calls     3214 non-null   float64
 1   minutes   3214 non-null   float64
 2   messages  3214 non-null   float64
 3   mb_used   3214 non-null   float64
 4   is_ultra  3214 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 125.7 KB


In [5]:
# 'calls' and 'messages' are loaded as float64; store them as integers instead.
for count_column in ('calls', 'messages'):
    df[count_column] = df[count_column].astype(int)

In [6]:
# Number of missing values in each column.
df.isna().sum()

calls       0
minutes     0
messages    0
mb_used     0
is_ultra    0
dtype: int64

In [7]:
# Boolean mask flagging rows that exactly repeat an earlier row.
is_repeat = df.duplicated()
# Keep only the flagged rows; an empty frame means there are no duplicates.
duplicate = df.loc[is_repeat]
duplicate

Unnamed: 0,calls,minutes,messages,mb_used,is_ultra


There are no missing values or duplicates in our data. In machine learning, we can't pass any missing values to our model. The dataset columns calls and messages have been changed to integers since all data within those columns are whole numbers. 

# Machine Learning 

In [8]:
# The label to predict is the is_ultra column.
target = df['is_ultra']
# Every remaining column is used as a model input.
features = df.drop(columns=['is_ultra'])

In [9]:
# Display the feature matrix (every column except is_ultra).
features

Unnamed: 0,calls,minutes,messages,mb_used
0,40,311.90,83,19915.42
1,85,516.75,56,22696.96
2,77,467.66,86,21060.45
3,106,745.53,81,8437.39
4,66,418.74,1,14502.75
...,...,...,...,...
3209,122,910.98,20,35124.90
3210,25,190.36,0,3275.61
3211,97,634.44,70,13974.06
3212,64,462.32,90,31239.78


The features for this model are calls, minutes, messages, and mb_used. These are the features because they all involve variables that can affect which plan is better.

In [10]:
# Display the target column, is_ultra.
target

0       0
1       0
2       0
3       1
4       0
       ..
3209    1
3210    0
3211    0
3212    0
3213    1
Name: is_ultra, Length: 3214, dtype: int64

Since we are trying to figure out which plan is better, we will use the plan column (`is_ultra`) as our target.

Split the data into training, validation, and test sets so that multiple models can be compared on the validation set.

In [11]:
# Step 1: hold out 20% of all rows as the final test set.
features_rest, features_test, target_rest, target_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Step 2: take 25% of the remaining rows (about 20% of the full dataset)
# as the validation set; what is left is the training set.
features_train, features_valid, target_train, target_valid = train_test_split(
    features_rest,
    target_rest,
    test_size=0.25,
    random_state=42,
)

In [12]:
# Report the size of the training and validation splits, features before target.
for split in (features_train, target_train, features_valid, target_valid):
    print(split.shape)

(1928, 4)
(1928,)
(643, 4)
(643,)


In [13]:
# Try tree depths 4 through 9 and report the validation accuracy of each.
for max_depth in range(4, 10):
    model = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
    predictions_valid = model.fit(features_train, target_train).predict(features_valid)
    valid_accuracy = accuracy_score(target_valid, predictions_valid)
    # Two prints on one output line: the model description, then its score.
    print("Accuracy =", model, ": ", end='')
    print(valid_accuracy)

Accuracy = DecisionTreeClassifier(max_depth=4, random_state=42) : 0.7807153965785381
Accuracy = DecisionTreeClassifier(max_depth=5, random_state=42) : 0.7713841368584758
Accuracy = DecisionTreeClassifier(max_depth=6, random_state=42) : 0.7791601866251944
Accuracy = DecisionTreeClassifier(max_depth=7, random_state=42) : 0.7884914463452566
Accuracy = DecisionTreeClassifier(max_depth=8, random_state=42) : 0.7713841368584758
Accuracy = DecisionTreeClassifier(max_depth=9, random_state=42) : 0.7853810264385692


In [14]:
# Try forests of 20 through 49 trees and report the validation accuracy of each.
for n_trees in range(20, 50):
    model = RandomForestClassifier(n_estimators=n_trees, random_state=42)
    predictions_valid = model.fit(features_train, target_train).predict(features_valid)
    valid_accuracy = accuracy_score(target_valid, predictions_valid)
    # Two prints on one output line: the model description, then its score.
    print('Accuracy =', model, ":",end='')
    print(valid_accuracy)

Accuracy = RandomForestClassifier(n_estimators=20, random_state=42) :0.776049766718507
Accuracy = RandomForestClassifier(n_estimators=21, random_state=42) :0.7822706065318819
Accuracy = RandomForestClassifier(n_estimators=22, random_state=42) :0.7884914463452566
Accuracy = RandomForestClassifier(n_estimators=23, random_state=42) :0.7838258164852255
Accuracy = RandomForestClassifier(n_estimators=24, random_state=42) :0.7869362363919129
Accuracy = RandomForestClassifier(n_estimators=25, random_state=42) :0.7838258164852255
Accuracy = RandomForestClassifier(n_estimators=26, random_state=42) :0.7822706065318819
Accuracy = RandomForestClassifier(n_estimators=27, random_state=42) :0.7869362363919129
Accuracy = RandomForestClassifier(n_estimators=28, random_state=42) :0.7869362363919129
Accuracy = RandomForestClassifier(n_estimators=29, random_state=42) :0.7822706065318819
Accuracy = RandomForestClassifier(n_estimators=30, random_state=42) :0.7869362363919129
Accuracy = RandomForestClassifier

In [15]:
# Linear baseline to compare against the tree-based models.
model = LogisticRegression(random_state=42, solver='liblinear')
model.fit(features_train, target_train)
# The score is computed on the validation split, so the label says so
# (it previously read "Training Set", which misreported what was measured).
print("Logistic Regression Accuracy on the Validation Set:", model.score(features_valid, target_valid))

Logistic Regression Accuracy on the Training Set: 0.7216174183514774


The model I would choose to figure out what phone plan is the best is DecisionTreeClassifier. 

We are trying to figure out which phone plan is the best, and the DecisionTreeClassifier reaches a validation accuracy of about 78% at a depth of only 4, which makes it very fast. I would not use LogisticRegression because of its lower accuracy of 72%. The RandomForestClassifier gives us great accuracy, but only at the cost of slowing down the model (creating more trees).

If we compare the models with the decision tree at a depth of 4, we see the DecisionTreeClassifier has a better accuracy for how fast the model is. 

In [16]:
# Final model. The seed is fixed (same value as the earlier experiments) so the
# forest, and therefore the reported test accuracy, is reproducible on re-run.
model = RandomForestClassifier(random_state=42)

In [17]:
# Fit the final model on the training split.
model.fit(features_train,target_train)

RandomForestClassifier()

In [18]:
# Predict labels for the held-out test split.
model_predictions =  model.predict(features_test)

In [19]:
# Fraction of test rows whose label was predicted correctly.
print(accuracy_score(target_test, model_predictions))

0.7993779160186625


RandomForest gave us the best accuracy (about 80% on the test set) and is recommended for predicting the best plan, Surf or Ultra. 