In [3]:
import pandas as pd

# Sample data
data = {
    'Gender': ['Male',   'Female',       'Female',   'Male',   'Female',    ' Male'],
    'City': ['New York', 'Los Angeles', 'New York', 'Chicago', 'Chicago', 'Los Angeles']
}

# Creating DataFrame
df = pd.DataFrame(data)

# Using pd.crosstab to get frequency count of each gender in each city
result = pd.crosstab(df['Gender'], df['City'], margins=True)

print(result)
data

City    Chicago  Los Angeles  New York  All
Gender                                     
Female        1            1         1    3
Male          1            1         1    3
All           2            2         2    6


{'Gender': ['Male', 'Female', 'Female', 'Male', 'Female', 'Male'],
 'City': ['New York',
  'Los Angeles',
  'New York',
  'Chicago',
  'Chicago',
  'Los Angeles']}

In [5]:
from sklearn.datasets import make_classification
from collections import Counter

# Generate a simple dataset for demonstration
# Setting n_informative=2 and n_redundant=0 to match n_features=2
X, y = make_classification(n_samples=100, n_features=2, n_informative=2, n_redundant=0, n_classes=2, weights=[0.7, 0.3], random_state=42)

# Calculate the majority class
most_common_class = Counter(y).most_common(1)[0][0]

# Predict the majority class for all instances
y_pred = [most_common_class] * len(y)

# Calculate accuracy of Naive Rule
accuracy = sum(y_pred == y) / len(y)
print(f"Naive Rule Accuracy: {accuracy:.2f}")


Naive Rule Accuracy: 0.70


In [10]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset
file_path = 'wdbc.csv'
df = pd.read_csv(file_path, header=None)

# Renaming columns for clarity
column_names = ['ID', 'Diagnosis'] + [f'Feature_{i}' for i in range(1, 31)]
df.columns = column_names

# Drop the ID column and encode 'Diagnosis'
df = df.drop(columns=['ID'])
df['Diagnosis'] = df['Diagnosis'].apply(lambda x: 1 if x == 'M' else 0)

# Separate features and target variable
X = df.drop(columns=['Diagnosis'])
y = df['Diagnosis']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = lr_model.predict(X_test)

# Calculate performance metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the results
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2 Score): {r2}")


Mean Squared Error (MSE): 0.06728376859363186
R-squared (R2 Score): 0.7108399944964154


In [11]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.naive_bayes import GaussianNB

# Load the dataset
file_path = 'wdbc.csv'
df = pd.read_csv(file_path, header=None)

# Renaming columns for clarity
column_names = ['ID', 'Diagnosis'] + [f'Feature_{i}' for i in range(1, 31)]
df.columns = column_names

# Drop the ID column and encode 'Diagnosis'
df = df.drop(columns=['ID'])
df['Diagnosis'] = df['Diagnosis'].apply(lambda x: 1 if x == 'M' else 0)

# Separate features and target variable
X = df.drop(columns=['Diagnosis'])
y = df['Diagnosis']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the baseline model (mean of the target variable)
baseline_prediction = y_test.mean()
baseline_mse = mean_squared_error(y_test, [baseline_prediction] * len(y_test))

# Initialize and train the Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Make predictions on the test data for Linear Regression
y_pred_lr = lr_model.predict(X_test)

# Calculate performance metrics for Linear Regression
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

# Initialize and train the Naive Bayes model
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Make predictions on the test data for Naive Bayes
y_pred_nb = nb_model.predict(X_test)

# Calculate performance metrics for Naive Bayes
mse_nb = mean_squared_error(y_test, y_pred_nb)
r2_nb = r2_score(y_test, y_pred_nb)

# Print the results
print(f"Baseline Model Mean Squared Error (MSE): {baseline_mse:.4f}")
print(f"Linear Regression Model Mean Squared Error (MSE): {mse_lr:.4f}, R-squared: {r2_lr:.4f}")
print(f"Naive Bayes Model Mean Squared Error (MSE): {mse_nb:.4f}, R-squared: {r2_nb:.4f}")

# Check if the linear regression model outperforms the baseline
if mse_lr < baseline_mse:
    print("The Linear Regression model has a better score than the baseline model.")
else:
    print("The Linear Regression model does not have a better score than the baseline model.")

# Check if the Naive Bayes model outperforms the baseline
if mse_nb < baseline_mse:
    print("The Naive Bayes model has a better score than the baseline model.")
else:
    print("The Naive Bayes model does not have a better score than the baseline model.")



Baseline Model Mean Squared Error (MSE): 0.2327
Linear Regression Model Mean Squared Error (MSE): 0.0673, R-squared: 0.7108
Naive Bayes Model Mean Squared Error (MSE): 0.0585, R-squared: 0.7487
The Linear Regression model has a better score than the baseline model.
The Naive Bayes model has a better score than the baseline model.


In [12]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [13]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
iris = fetch_ucirepo(id=53)

# data (as pandas dataframes)
X = iris.data.features
y = iris.data.targets

# metadata
print(iris.metadata)

# variable information
print(iris.variables)


{'uci_id': 53, 'name': 'Iris', 'repository_url': 'https://archive.ics.uci.edu/dataset/53/iris', 'data_url': 'https://archive.ics.uci.edu/static/public/53/data.csv', 'abstract': 'A small classic dataset from Fisher, 1936. One of the earliest known datasets used for evaluating classification methods.\n', 'area': 'Biology', 'tasks': ['Classification'], 'characteristics': ['Tabular'], 'num_instances': 150, 'num_features': 4, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1936, 'last_updated': 'Tue Sep 12 2023', 'dataset_doi': '10.24432/C56C76', 'creators': ['R. A. Fisher'], 'intro_paper': {'ID': 191, 'type': 'NATIVE', 'title': 'The Iris data set: In search of the source of virginica', 'authors': 'A. Unwin, K. Kleinman', 'venue': 'Significance, 2021', 'year': 2021, 'journal': 'Significance, 2021', 'DOI': '1740-9713.01589', 'URL': 'https://www.semanticscholar.org

In [14]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define and train the baseline model (Logistic Regression)
baseline_model = LogisticRegression(max_iter=200)
baseline_model.fit(X_train, y_train)
baseline_predictions = baseline_model.predict(X_test)
baseline_accuracy = accuracy_score(y_test, baseline_predictions)

# Define and train a more complex model (Random Forest Classifier)
ml_model = RandomForestClassifier(n_estimators=100, random_state=42)
ml_model.fit(X_train, y_train)
ml_predictions = ml_model.predict(X_test)
ml_accuracy = accuracy_score(y_test, ml_predictions)

# Print the results
print(f"Baseline Model Accuracy: {baseline_accuracy:.2f}")
print(f"ML Model Accuracy: {ml_accuracy:.2f}")

# Assert that the ML model should have a better accuracy than the baseline
ml_accuracy > baseline_accuracy, "ML model should have a better score than the baseline model"


Baseline Model Accuracy: 1.00
ML Model Accuracy: 1.00


(False, 'ML model should have a better score than the baseline model')