In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

%matplotlib inline
plt.style.use('seaborn-whitegrid')
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)

print("Libraries imported successfully!")


In [None]:
# Load the Iris dataset through seaborn's dataset loader
iris = sns.load_dataset('iris')

# Preview the first few rows
print("First 5 rows of the Iris dataset:")
print(iris.head())

# Dataset shape as (rows, columns)
print("\nDataset shape:", iris.shape)

# Column dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
print("\nDataset information:")
iris.info()

# Summary statistics
print("\nBasic statistics:")
print(iris.describe())


In [None]:
# Class distribution: absolute counts, then each species' share in percent
print("Class distribution:")
print(iris['species'].value_counts())
print("\nClass distribution (percentage):")
print(iris['species'].value_counts(normalize=True) * 100)

# Descriptive statistics per species, transposed so species become columns
print("\nStatistics by species:")
species_stats = iris.groupby('species').describe().transpose()
print(species_stats)

# Correlation matrix of the measurement columns.
# `species` holds text labels; pandas >= 2.0 no longer drops non-numeric
# columns silently in DataFrame.corr() and raises instead, so restrict the
# frame to numeric columns explicitly (works on older pandas as well).
print("\nCorrelation matrix:")
correlation_matrix = iris.select_dtypes(include='number').corr()
print(correlation_matrix)


In [None]:
# Histograms for each feature, one overlaid series per species
plt.figure(figsize=(15, 10))

for i, feature in enumerate(['sepal_length', 'sepal_width', 'petal_length', 'petal_width']):
    plt.subplot(2, 2, i+1)
    # Compute the 15 bin edges once from all rows so every species is counted
    # in the same bins. Passing bins=15 to each hist() call would give each
    # species its own edges and bin widths, making the overlaid bar heights
    # not directly comparable.
    bin_edges = np.histogram_bin_edges(iris[feature], bins=15)
    for species in iris['species'].unique():
        plt.hist(iris.loc[iris['species'] == species, feature],
                 alpha=0.7,
                 bins=bin_edges,
                 label=species)
    plt.xlabel(feature)
    plt.ylabel('Frequency')
    plt.title(f'Distribution of {feature}')
    plt.legend()

plt.tight_layout()
plt.show()

# Kernel Density Estimation (KDE) plots, one filled curve per species
plt.figure(figsize=(15, 10))

for i, feature in enumerate(['sepal_length', 'sepal_width', 'petal_length', 'petal_width']):
    plt.subplot(2, 2, i+1)
    for species in iris['species'].unique():
        # `fill=True` replaces the deprecated `shade=True` keyword
        # (seaborn deprecated `shade` in favour of `fill`).
        sns.kdeplot(iris.loc[iris['species'] == species, feature],
                    label=species,
                    fill=True)
    plt.xlabel(feature)
    plt.ylabel('Density')
    plt.title(f'Density Distribution of {feature}')
    plt.legend()

plt.tight_layout()
plt.show()


In [None]:
# Box plots: a 2x2 grid with one panel per measurement, grouped by species
box_fig, box_axes = plt.subplots(2, 2, figsize=(15, 10))
box_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

for ax, feature in zip(box_axes.flat, box_columns):
    sns.boxplot(x='species', y=feature, data=iris, ax=ax)
    ax.set_title(f'Box Plot of {feature} by Species')
    ax.set_xlabel('Species')
    ax.set_ylabel(feature)

box_fig.tight_layout()
plt.show()

# Violin plots: same 2x2 layout as the box plots, with quartile lines drawn
# inside each violin
violin_fig, violin_axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
violin_panels = violin_axes.ravel()

for i, feature in enumerate(['sepal_length', 'sepal_width', 'petal_length', 'petal_width']):
    panel = violin_panels[i]
    sns.violinplot(x='species', y=feature, data=iris, inner='quartile', ax=panel)
    panel.set(
        title=f'Violin Plot of {feature} by Species',
        xlabel='Species',
        ylabel=feature,
    )

violin_fig.tight_layout()
plt.show()


In [None]:
# Correlation heatmap: annotated view of the `correlation_matrix` computed
# in the summary-statistics cell
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation Matrix Heatmap')
heatmap_fig.tight_layout()
plt.show()

# Pairplot to show relationships between features.
# sns.pairplot builds its own figure (sized through `height`), so no
# plt.figure() call precedes it: a figure opened here would stay empty and be
# rendered as a blank plot. The title is set on the grid's own figure.
pair_grid = sns.pairplot(iris, hue='species', markers=['o', 's', 'D'], height=2.5)
pair_grid.figure.suptitle('Pairplot of Iris Dataset Features by Species', y=1.02)
plt.show()

# Correlation matrix of the measurements, computed separately per species
print("Correlation by species:")
species_names = iris['species'].unique()
for species in species_names:
    # Keep only this species' rows and drop the label column before correlating
    belongs_to_species = iris['species'].eq(species)
    species_data = iris.loc[belongs_to_species].drop(columns='species')
    print(f"\nCorrelation matrix for {species}:")
    print(species_data.corr())


In [None]:
# Principal Component Analysis (PCA)
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Measurement columns only (label column removed), then standardized;
# `features` and `X` are read again by later cells
features = iris.drop(columns='species')
scaler = StandardScaler()
X = scaler.fit_transform(features)

# Fit a two-component PCA and collect the projected coordinates together
# with the species label of each row
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X)
pca_df = pd.DataFrame(
    principal_components,
    columns=['PC1', 'PC2'],
).assign(species=iris['species'])

# Scatter plot of the two principal components, coloured by species; axis
# labels report each component's share of explained variance
pca_fig, pca_ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    x='PC1', y='PC2', hue='species',
    data=pca_df, s=100, alpha=0.8,
    ax=pca_ax,
)
pca_ax.set_title('PCA of Iris Dataset')
pca_ax.set_xlabel(f'Principal Component 1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
pca_ax.set_ylabel(f'Principal Component 2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
pca_ax.grid(True, linestyle='--', alpha=0.7)
pca_ax.legend(title='Species')
pca_fig.tight_layout()
plt.show()

# Share of variance captured by each retained component, and their total
print("Explained variance ratio:")
print(pca.explained_variance_ratio_)
print(f"Total explained variance: {sum(pca.explained_variance_ratio_):.2%}")

# Loading scores: component weights (one row per original feature) scaled by
# the square root of each component's explained variance
component_scale = np.sqrt(pca.explained_variance_)
scaled_components = pca.components_.T * component_scale
loading_scores = pd.DataFrame(
    scaled_components,
    index=features.columns,
    columns=['PC1', 'PC2'],
)
print("\nPCA loading scores:")
print(loading_scores)


In [None]:
# t-SNE visualization
from sklearn.manifold import TSNE

# Two-dimensional embedding of the standardized matrix X; the seed is fixed
# via random_state
tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(X)
tsne_df = pd.DataFrame(
    tsne_results,
    columns=['t-SNE1', 't-SNE2'],
).assign(species=iris['species'])

# Scatter plot of the embedding, coloured by species
tsne_fig, tsne_ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    x='t-SNE1', y='t-SNE2', hue='species',
    data=tsne_df, s=100, alpha=0.8,
    ax=tsne_ax,
)
tsne_ax.set(
    title='t-SNE of Iris Dataset',
    xlabel='t-SNE Dimension 1',
    ylabel='t-SNE Dimension 2',
)
tsne_ax.grid(True, linestyle='--', alpha=0.7)
tsne_ax.legend(title='Species')
tsne_fig.tight_layout()
plt.show()


In [None]:
# ANOVA test to check if there are significant differences between species
from scipy.stats import f_oneway

print("One-way ANOVA test results:")
for feature in ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']:
    # One sample of this feature per species
    setosa = iris[iris['species'] == 'setosa'][feature]
    versicolor = iris[iris['species'] == 'versicolor'][feature]
    virginica = iris[iris['species'] == 'virginica'][feature]

    f_stat, p_value = f_oneway(setosa, versicolor, virginica)
    # General ("g") format switches to scientific notation for tiny values.
    # A fixed-point ".10f" prints every p-value below 5e-11 as 0.0000000000,
    # which hides how small it actually is.
    print(f"{feature}: F-statistic = {f_stat:.4f}, p-value = {p_value:.4g}")

    # Decision at the 5% significance level
    if p_value < 0.05:
        print("   Significant difference exists between species\n")
    else:
        print("   No significant difference between species\n")

# Shapiro-Wilk test for normality
from scipy.stats import shapiro

# NOTE(review): each feature is tested on all rows pooled across species. If
# the goal is to check the normality assumption behind the ANOVA above, the
# test presumably belongs on each species separately -- confirm intent.
print("Shapiro-Wilk test for normality:")
for feature in ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']:
    stat, p = shapiro(iris[feature])
    # General ("g") format keeps small p-values readable; a fixed-point
    # ".10f" truncates them to one or two significant digits or to zero.
    print(f"{feature}: W = {stat:.4f}, p-value = {p:.4g}")

    # Decision at the 5% significance level
    if p > 0.05:
        print("   Data appears to be normally distributed\n")
    else:
        print("   Data does not appear to be normally distributed\n")


In [None]:
# Correlation test (Pearson) for every unordered pair of measurements
from itertools import combinations
from scipy.stats import pearsonr

print("Pearson correlation tests:")
# Deliberately not named `features`: that name holds the DataFrame of
# measurement columns created in the PCA cell, and rebinding it to a list
# here would break any later `features.columns` access on a top-to-bottom run.
feature_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

# combinations() yields pairs in the same order as a nested i < j index loop
for first, second in combinations(feature_names, 2):
    corr, p = pearsonr(iris[first], iris[second])
    # General ("g") format: a fixed-point ".10f" prints every p-value below
    # 5e-11 as 0.0000000000.
    print(f"{first} vs {second}: r = {corr:.4f}, p-value = {p:.4g}")

    # Decision at the 5% significance level; the sign of r gives the direction
    if p < 0.05:
        if corr > 0:
            print("   Significant positive correlation\n")
        else:
            print("   Significant negative correlation\n")
    else:
        print("   No significant correlation\n")


In [None]:
# K-means clustering
from sklearn.cluster import KMeans

# Apply K-means with 3 clusters (since we know there are 3 species).
# n_init is pinned explicitly: scikit-learn changed its default for this
# parameter between releases, so leaving it unset makes the clustering depend
# on the installed version.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans_labels = kmeans.fit_predict(X)

# Column names for the standardized matrix X. X was built from
# iris.drop('species', axis=1), so the non-label columns of `iris` match it in
# order. Taking the names from `iris` keeps this cell independent of whether
# `features` still holds a DataFrame when it runs.
measurement_names = iris.columns.drop('species')

# Create a DataFrame with the results
kmeans_df = pd.DataFrame(data=X, columns=measurement_names)
kmeans_df['cluster'] = kmeans_labels
kmeans_df['actual_species'] = iris['species']

# Map cluster numbers to species names for better comparison: each cluster is
# labelled with the species that occurs most often among its members
cluster_species_map = {}
for cluster in range(kmeans.n_clusters):
    species_in_cluster = kmeans_df.loc[kmeans_df['cluster'] == cluster, 'actual_species'].value_counts()
    cluster_species_map[cluster] = species_in_cluster.idxmax()

kmeans_df['predicted_species'] = kmeans_df['cluster'].map(cluster_species_map)

# Visualize clusters using PCA: K-means assignments (left) next to the actual
# species (right), both drawn on the first two principal components
cluster_fig, (cluster_ax, species_ax) = plt.subplots(1, 2, figsize=(15, 6))
pc1 = principal_components[:, 0]
pc2 = principal_components[:, 1]

# (axes, colouring values, panel title, legend title) for each panel
panel_specs = [
    (cluster_ax, kmeans_labels, 'K-means Clustering (3 clusters)', 'Cluster'),
    (species_ax, iris['species'], 'Actual Species', 'Species'),
]

for panel_ax, hue_values, panel_title, legend_title in panel_specs:
    sns.scatterplot(x=pc1, y=pc2, hue=hue_values,
                    palette='viridis', s=100, alpha=0.8, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Principal Component 1')
    panel_ax.set_ylabel('Principal Component 2')
    panel_ax.legend(title=legend_title)

cluster_fig.tight_layout()
plt.show()

# Confusion matrix between clusters and actual species
from sklearn.metrics import confusion_matrix

# Fix the label order once and use it for both the matrix and its row/column
# names. Without `labels`, confusion_matrix orders rows/columns by sorted
# label, while unique() returns order of appearance; the two only agree when
# the data happens to be stored in sorted species order.
species_labels = sorted(iris['species'].unique())
conf_matrix = confusion_matrix(iris['species'],
                               kmeans_df['predicted_species'],
                               labels=species_labels)
conf_df = pd.DataFrame(conf_matrix,
                       index=species_labels,
                       columns=species_labels)

plt.figure(figsize=(8, 6))
sns.heatmap(conf_df, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix: Actual vs. Predicted Species')
plt.xlabel('Predicted Species')
plt.ylabel('Actual Species')
plt.tight_layout()
plt.show()

# Calculate accuracy: share of rows whose cluster-derived label equals the
# actual species
accuracy = (kmeans_df['actual_species'] == kmeans_df['predicted_species']).mean()
print(f"K-means clustering accuracy: {accuracy:.2%}")


In [None]:
# Random Forest for feature importance
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Encode species labels as integers
le = LabelEncoder()
y = le.fit_transform(iris['species'])

# Train Random Forest on the standardized matrix X (seed fixed via random_state)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)

# Get feature importance.
# X was built from iris.drop('species', axis=1), so the non-label columns of
# `iris` line up with the entries of rf.feature_importances_. Taking the names
# from `iris` keeps this cell independent of whether `features` still holds a
# DataFrame when it runs.
feature_importance = pd.DataFrame({
    'Feature': iris.columns.drop('species'),
    'Importance': rf.feature_importances_
}).sort_values('Importance', ascending=False)

print("Feature importance:")
print(feature_importance)

# Visualize feature importance as horizontal bars, most important first
plt.figure(figsize=(10, 6))
# seaborn 0.13 deprecated passing `palette` without `hue`. Assigning the
# categorical variable to `hue` keeps one colour per bar; dodge=False keeps
# each bar at full width on seaborn versions that would otherwise make room
# for hue levels.
importance_ax = sns.barplot(x='Importance', y='Feature', hue='Feature',
                            data=feature_importance, palette='viridis',
                            dodge=False)
# Some seaborn versions add a legend for the hue variable; it only repeats the
# y-axis labels, so remove it when present.
importance_legend = importance_ax.get_legend()
if importance_legend is not None:
    importance_legend.remove()
plt.title('Feature Importance from Random Forest')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()
