# Part 1: Data Preprocessing

## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Get the dataset

In [None]:
df = pd.read_csv('Data.csv')

# Or any other way of getting the dataframe
# yfinance.Ticker.history() returns a pandas dataframe, etc.

X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# X -> matrix of variables
# y -> Vector used for training + Prediction evaluataion -> Might not always be there

## Fill in Missing Values

In [None]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')  # Most common missing values is a np.nan but may vary between datasets
imputer.fit(X[:, 1:3])  # Rows having missing values, ONLY NUMERIC VALUES
X[:, 1:3] = imputer.transform(X[:, 1:3])



## Processing Textual Data

ONE HOT ENCODE

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')  # Meaning, do nothing with the other rows
X = np.array(ct.fit_transform(X))

LABEL ENCODER

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(y)

## Splitting into Training and Test Data Set

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Standardisation: Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])

# Remember, fit_transform() the training data and transform() the test data
X_test[:, 3:] = sc.transform(X_test[:, 3:])

# Part 2: Regression

## Simple Linear Regression

### Preprocessing

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
dataset = pd.read_csv('Salary_Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

FileNotFoundError: ignored

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

### Training the model

In [None]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)

### Visualisation

In [None]:
plt.scatter(X)

## Multiple Linear Regression

Again like in Simple Linear Regression, use LinearRegression() class

Feature Scaling Not Required

## Polynomial Linear Regression

In [None]:
# Similar to linear regression, except get powers of feature using sklearn
from sklearn.preprocessing import PolynomialFeatures

pf = PolynomialFeatures(degree=2)  # Change this, if needed. Too high degree -> Overfitting
X = [[1]]
X = pf.fit_transform(X)

# After this, train normal LinearRegression with X

## SVR

In [None]:
# Feature Scaling Required Always
from sklearn.svm import SVR

model = SVR(kernel='linear')  # Or 'rbf' for non-linear/ any other kernel (check documentation)

model.fit(X)

# Predict normally

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

model = DecisionTreeRegressor(random_state=42)

model.fit(X)  # As you usually would 
model.predict(X_test)  # Again, as usual 

## Random Forest

In [None]:
# Random Forest - teams of multiple Decision Trees

from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=10, random_state=42)

# As usual...
model.fit(X)
model.predict()

## Model Evaluation for Regression Models - R^2 Score 

In [None]:
from sklearn.metrics import r2_score

print(r2_score(y_test, y_pred)

# Part 3 - Classification

## Important Regarding Classification Models

Always scale the numeric values for classification related problems

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

# As you would...
model.fit(X_train, y_train)
model.predict(X_test)

### K Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=5)

# As you would...
model.fit(X_train, y_train)
model.predict(X_test)

NameError: ignored

### Linear SVC

In [None]:
from sklearn.svm import SVC

model = SVC(kernel='linear')
# As you would...
model.fit(X_train, y_train)
model.predict(X_test)

NameError: ignored

### Kernel SVC (kernel='rbf')

In [None]:
from sklearn.svm import SVC

model = SVC(kernel='rbf')

# As you would...
model.fit(X_train, y_train)
model.predict(X_test)

NameError: ignored

### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()

model.fit(X_train, y_train)
model.predict(X_test)

NameError: ignored

### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()

model.fit(X_train, y_train)
model.predict(X_test)

NameError: ignored

### Random Forest Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=10)

model.fit(X_train, y_train)
model.predict(X_test)

### Evaluation of Classification Models - Accuracy, Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

cm = confusion_matrix(y_test, y_pred)
print(cm)

acc = accuracy_score(y_test, y_pred)
print(acc)

## Part 4 - Clustering

In [None]:
pass

## Part 5 - Association Rule Learning

### Apriori

In [None]:
!pip install apyori

Collecting apyori
  Downloading https://files.pythonhosted.org/packages/5e/62/5ffde5c473ea4b033490617ec5caa80d59804875ad3c3c57c0976533a21a/apyori-1.1.2.tar.gz
Building wheels for collected packages: apyori
  Building wheel for apyori (setup.py) ... [?25l[?25hdone
  Created wheel for apyori: filename=apyori-1.1.2-cp36-none-any.whl size=5975 sha256=4e7c23e3aee16f3beed882426aa6e1cbbaa5737e7cfcedf7d0b6f6299974408c
  Stored in directory: /root/.cache/pip/wheels/5d/92/bb/474bbadbc8c0062b9eb168f69982a0443263f8ab1711a8cad0
Successfully built apyori
Installing collected packages: apyori
Successfully installed apyori-1.1.2


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
data = pd.read_csv('Market_Basket_Optimisation.csv', header=None)

l = []
for row in data.values:
  holder = list(set(row))
  if np.nan in holder:
    holder.remove(np.nan)
  l.append(holder)
print(l)

[['low fat yogurt', 'energy drink', 'antioxydant juice', 'shrimp', 'whole weat flour', 'avocado', 'olive oil', 'green grapes', 'tomato juice', 'yams', 'honey', 'frozen smoothie', 'cottage cheese', 'almonds', 'vegetables mix', 'spinach', 'salad', 'salmon', 'mineral water', 'green tea'], ['meatballs', 'eggs', 'burgers'], ['chutney'], ['avocado', 'turkey'], ['milk', 'whole wheat rice', 'energy bar', 'mineral water', 'green tea'], ['low fat yogurt'], ['whole wheat pasta', 'french fries'], ['light cream', 'shallot', 'soup'], ['spaghetti', 'frozen vegetables', 'green tea'], ['french fries'], ['pet food', 'eggs'], ['cookies'], ['eggs', 'burgers', 'turkey', 'cooking oil', 'mineral water'], ['champagne', 'spaghetti', 'cookies'], ['salmon', 'mineral water'], ['mineral water'], ['low fat yogurt', 'shrimp', 'chocolate', 'honey', 'chicken', 'cooking oil', 'oil'], ['eggs', 'turkey'], ['extra dark chocolate', 'black tea', 'eggs', 'fresh tuna', 'turkey', 'spaghetti', 'chicken', 'tomatoes', 'salmon', '

In [None]:
len(l)

7501

In [25]:
from apyori import apriori

rules = apriori(transactions=l, min_support=0.003, min_confidence=0.2,
                min_lift=3, min_length=2, max_length=2)

In [26]:
results = list(rules)

results

[RelationRecord(items=frozenset({'chicken', 'light cream'}), support=0.004532728969470737, ordered_statistics=[OrderedStatistic(items_base=frozenset({'light cream'}), items_add=frozenset({'chicken'}), confidence=0.29059829059829057, lift=4.84395061728395)]),
 RelationRecord(items=frozenset({'escalope', 'mushroom cream sauce'}), support=0.005732568990801226, ordered_statistics=[OrderedStatistic(items_base=frozenset({'mushroom cream sauce'}), items_add=frozenset({'escalope'}), confidence=0.3006993006993007, lift=3.790832696715049)]),
 RelationRecord(items=frozenset({'pasta', 'escalope'}), support=0.005865884548726837, ordered_statistics=[OrderedStatistic(items_base=frozenset({'pasta'}), items_add=frozenset({'escalope'}), confidence=0.3728813559322034, lift=4.700811850163794)]),
 RelationRecord(items=frozenset({'honey', 'fromage blanc'}), support=0.003332888948140248, ordered_statistics=[OrderedStatistic(items_base=frozenset({'fromage blanc'}), items_add=frozenset({'honey'}), confidence=0