In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
###################################################
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
# Importing some classes to perform various Ensemble techniques
'''
Let me briefly describe what each of these classes does:
1. BaggingClassifier: Implements bagging (bootstrap aggregating): it fits copies of a base estimator on random subsets of the training data and aggregates their predictions.

2. RandomForestClassifier: Implements the Random Forest algorithm, which is an ensemble 
of decision trees trained with bagging.

3. GradientBoostingClassifier: Implements gradient boosting, a technique that builds an 
ensemble of weak learners (often decision trees) in a sequential manner, where each new 
learner corrects errors made by the previous one.

4. AdaBoostClassifier: Implements AdaBoost (Adaptive Boosting), a boosting algorithm that 
combines multiple weak learners (typically shallow decision trees) to create a strong classifier.

5. VotingClassifier: Implements voting ensembles, which combine the predictions from 
multiple classifiers either by majority voting (hard voting) or by averaging the predicted 
probabilities (soft voting)
'''
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier

from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline

# To perform hyperparameter tuning and cross-validation of the classification models
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
###################################################
#Space for making a Streamlit update

In [None]:
# The dataset we use is the Iris dataset, loaded through seaborn's `load_dataset` helper.
# 1. The Iris dataset is a famous dataset in the field of machine learning and statistics.
# 2. It contains information about various species of iris flowers, with measurements of
#    their sepal length, sepal width, petal length, and petal width.
# 3. These measurements are used to classify the iris flowers into three species:
#    Setosa, Versicolor, and Virginica.
iris_df = sns.load_dataset('iris')

# Preview 10 random rows; the fixed random_state makes the preview identical on every run.
iris_df.sample(10, random_state=42)

The following line of code prints the shape of the Iris dataset, i.e. the number of rows (instances) and columns it contains.

In [None]:
# Unpack the (rows, columns) pair once, then report both numbers.
n_rows, n_features = iris_df.shape
print("The shape of Iris dataset is :{0} rows and {1} features.".format(n_rows, n_features))

In [None]:
# Count the missing values in every column and display them as a one-column table.
null_counts = iris_df.isna().sum()
pd.DataFrame(data=null_counts, columns=['No of null values'])


Performing multivariate and univariate analysis on the Iris dataset involves examining the relationships between multiple variables (multivariate analysis) and analyzing individual variables separately (univariate analysis).

In [None]:
# Bar chart of the number of samples per species.
# An explicit Axes keeps the figure size, the plot, and the title tied to the same figure
# instead of relying on pyplot's implicit "current figure" state.
fig, ax = plt.subplots(figsize=(7, 7))
sns.countplot(data=iris_df, x='species', ax=ax)
ax.set_title('Count plot for No. of samples of each species')
# plt.show() renders the figure and keeps the Text(...) repr of the title out of the cell output.
plt.show()

In [None]:
# Grid of pairwise plots between the columns, coloured by species.
species_grid = sns.pairplot(iris_df, hue='species')
plt.show()

From the pair plot we can observe the relationships between the different features. We don't need to perform any data preprocessing on the dataset, as it is a 'healthy dataset', i.e. it has no missing values and the distributions are approximately normal. A "healthy dataset" with no missing values and a normal distribution can make the modeling process smoother and more straightforward. However, keep in mind that even in such cases, it's still good practice to perform exploratory data analysis (EDA) to gain insights into the data and understand its characteristics better. 

Encoding categorical variables is essential for machine learning as it converts categorical data into numerical form, enabling algorithms to process it effectively. 

In [None]:
# Replace the species names with integer class codes, overwriting the column in place.
encoder = LabelEncoder()
species_codes = encoder.fit_transform(iris_df['species'])
iris_df['species'] = species_codes

Performing a train-test split is crucial in machine learning for evaluating model performance. This step divides the dataset into training and testing sets, allowing the model to learn patterns from the training data and then assess its performance on unseen data from the testing set.


In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
iris_features = iris_df.iloc[:, 0:4]  # first four columns are the model inputs
iris_target = iris_df.iloc[:, -1]     # last column is the label
X_train, X_test, y_train, y_test = train_test_split(
    iris_features,
    iris_target,
    test_size=0.2,
    random_state=42,
)
print("New shape of training dataset is : ", X_train.shape)
print("New shape of testing dataset is : ", X_test.shape)

In [None]:
# Candidate classifiers to compare.
# Every stochastic model gets a fixed random_state (the same seed as the train/test split)
# so that the scores are reproducible across runs.
lr = LogisticRegression()
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
gbc = GradientBoostingClassifier(n_estimators=100, random_state=42)
# The original cell passed an undefined name (`Decision_tree`) as the base estimator, which
# raises NameError on a fresh kernel. A depth-1 tree (decision stump) is used instead; it is
# the weak learner scikit-learn itself uses by default for AdaBoost.
ada = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),
    n_estimators=100,
    random_state=42,
)
# estimator=None makes BaggingClassifier fall back to its default base estimator.
bag = BaggingClassifier(estimator=None, n_estimators=100, random_state=42)
estimators = [lr, dt, rf, gbc, ada, bag]

In [None]:
# Train the logistic-regression model, predict the held-out rows, and show the accuracy.
y_pred = lr.fit(X_train, y_train).predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)