# Statistics Basics


1. How do you calculate the mean, median, and mode of a dataset?

In [None]:
import numpy as np
from scipy import stats

# Dataset
data = [2, 4, 4, 6, 8]

# Mean: arithmetic average of the observations
mean = np.mean(data)

# Median: middle value of the sorted observations
median = np.median(data)

# Mode: most frequent value. stats.mode returns a result object that bundles
# the modal value with its count, and some SciPy releases wrap each in a
# length-1 array. Flatten and take the first element so that a plain scalar
# is stored and printed instead of the whole result object.
mode_result = stats.mode(data)
mode = np.ravel(mode_result.mode)[0]

print("Mean:", mean)
print("Median:", median)
print("Mode:", mode)


2. Write a Python program to compute the variance and standard deviation of a dataset.

In [None]:
import numpy as np

# Dataset
data = [2, 4, 4, 6, 8]
observations = np.asarray(data)

# Variance: mean squared deviation from the mean.
# NumPy's default (ddof=0) divides by N, i.e. the population variance.
variance = observations.var()

# Standard deviation: square root of the variance, in the data's own units
std_dev = observations.std()

print("Variance:", variance)
print("Standard Deviation:", std_dev)


3.  Create a dataset and classify it into nominal, ordinal, interval, and ratio types.

In [None]:
import pandas as pd

# Create a dataset
data = {
    "Student ID": [1, 2, 3, 4, 5],
    "Name": ["John", "Jane", "Bob", "Alice", "Mike"],
    "Grade Level": ["Freshman", "Sophomore", "Junior", "Senior", "Freshman"],
    "Satisfaction Rating": ["Very Satisfied", "Satisfied", "Neutral", "Dissatisfied", "Very Dissatisfied"],
    "Score": [85, 90, 78, 92, 88],
    "Age": [18, 19, 20, 21, 18]
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Print the DataFrame
print(df)

# Storage dtypes only describe how pandas holds each column; they do not
# say anything about the statistical level of measurement.
print("\nData Types:")
print(df.dtypes)

# Level of measurement for each column. pandas cannot infer this, so it is
# stated explicitly:
#   Nominal  - labels with no order (IDs are numbers, but arithmetic on them is meaningless)
#   Ordinal  - ordered categories whose gaps are not measurable
#   Interval - equal spacing but no true zero (a score of 0 is treated here as
#              not meaning "no knowledge")
#   Ratio    - equal spacing and a true zero, so ratios are meaningful
measurement_scales = {
    "Student ID": "Nominal",
    "Name": "Nominal",
    "Grade Level": "Ordinal",
    "Satisfaction Rating": "Ordinal",
    "Score": "Interval",
    "Age": "Ratio",
}

classification = pd.DataFrame({
    "Column": list(df.columns),
    "Storage dtype": [str(dtype) for dtype in df.dtypes],
    "Measurement Scale": [measurement_scales[column] for column in df.columns],
})

print("\nMeasurement Scales:")
print(classification.to_string(index=False))



4. Implement sampling techniques like random sampling and stratified sampling.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Sample dataset: two numeric features and a balanced binary target
data = {
    "Feature1": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    "Feature2": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
    "Target": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
}
df = pd.DataFrame(data)

## Random Sampling
# Five rows drawn without replacement; the fixed random_state makes the draw repeatable
random_sample = df.sample(n=5, random_state=42)
print("Random Sample:", random_sample, sep="\n")

## Stratified Sampling
# Separate the feature columns from the target column
X = df[["Feature1", "Feature2"]]
y = df["Target"]

# stratify=y asks the splitter to keep the class proportions of y in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    stratify=y,
    random_state=42,
)

# Re-attach the target to its features (rows align on the shared index)
train_set = X_train.join(y_train)
test_set = X_test.join(y_test)

print("\nStratified Train Set:", train_set, sep="\n")
print("\nStratified Test Set:", test_set, sep="\n")


5. Write a Python function to calculate the range of a dataset.

In [None]:
def calculate_range(data):
    """Return the range of a dataset: its largest value minus its smallest.

    Parameters
    ----------
    data : iterable of numbers
        The observations. One-shot iterables such as generators are
        supported because the values are materialised once up front.

    Returns
    -------
    number
        ``max(data) - min(data)``.

    Raises
    ------
    ValueError
        If ``data`` contains no values.
    """
    # Materialise first: calling max() and then min() directly on a generator
    # would exhaust it in max() and leave nothing for min().
    values = list(data)
    if not values:
        raise ValueError("calculate_range() requires at least one value")
    return max(values) - min(values)

# Example usage:
data = [2, 4, 4, 6, 8]
range_value = calculate_range(data)
print("Range:", range_value)


6. Create a dataset and plot its histogram to visualize skewness.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Draw 1000 values from an exponential distribution (seeded for repeatability);
# its long right tail gives a positively skewed sample
np.random.seed(0)
data = np.random.exponential(scale=1, size=1000)

# Wrap the sample in a single-column DataFrame
df = pd.DataFrame({"Value": data})

# Histogram drawn through the explicit Axes interface
fig, ax = plt.subplots()
ax.hist(df["Value"], bins=30, alpha=0.7, color="blue", edgecolor="black")
ax.set_title("Histogram of Skewed Data")
ax.set_xlabel("Value")
ax.set_ylabel("Frequency")
plt.show()



7.  Calculate skewness and kurtosis of a dataset using Python libraries.

In [None]:
import pandas as pd
import numpy as np
from scipy import stats

# Seed the global RNG so the sample, and therefore the statistics printed
# below, are identical on every run regardless of which cells ran earlier.
np.random.seed(0)

# Create a dataset
data = np.random.normal(0, 1, 1000)

# Calculate skewness (asymmetry of the distribution; about 0 for normal data)
skewness = stats.skew(data)

# Calculate kurtosis (SciPy's default is Fisher / excess kurtosis, which is
# about 0 for normal data)
kurtosis = stats.kurtosis(data)

print("Skewness:", skewness)
print("Kurtosis:", kurtosis)


8. Generate a dataset and demonstrate positive and negative skewness.

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Fix the RNG state so both samples are reproducible
np.random.seed(0)

# Right-skewed sample: exponential draws have a long right tail
positive_skew = np.random.exponential(scale=1, size=1000)

# Left-skewed sample: negating a fresh exponential draw mirrors the tail to the left
negative_skew = -np.random.exponential(scale=1, size=1000)

# One panel per sample, drawn side by side with identical styling
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
panels = (
    (positive_skew, "blue", "Positive Skewness"),
    (negative_skew, "red", "Negative Skewness"),
)
for axis, (values, color, title) in zip(ax, panels):
    axis.hist(values, bins=30, alpha=0.7, color=color, edgecolor="black")
    axis.set_title(title)
    axis.set_xlabel("Value")
    axis.set_ylabel("Frequency")

plt.show()


9.  Write a Python script to calculate covariance between two datasets.

In [None]:
import numpy as np

# Two independent standard-normal samples of 100 points each (seeded)
np.random.seed(0)
dataset1 = np.random.normal(loc=0, scale=1, size=100)
dataset2 = np.random.normal(loc=0, scale=1, size=100)

# np.cov returns the 2x2 covariance matrix of the pair; the off-diagonal
# entry [0, 1] is the covariance between dataset1 and dataset2
covariance_matrix = np.cov(dataset1, dataset2)
covariance = covariance_matrix[0, 1]

print("Covariance:", covariance)


10. Write a Python script to calculate the correlation coefficient between two datasets.

In [None]:
#Using Numpy

import numpy as np

# dataset2 is a scaled copy of dataset1 plus a little Gaussian noise, so the
# two samples are strongly positively related
np.random.seed(0)
dataset1 = np.random.normal(loc=0, scale=1, size=100)
noise = np.random.normal(loc=0, scale=0.2, size=100)
dataset2 = 0.8 * dataset1 + noise

# np.corrcoef returns the 2x2 correlation matrix; entry [0, 1] is the
# Pearson correlation between the two samples
correlation_matrix = np.corrcoef(dataset1, dataset2)
correlation_coefficient = correlation_matrix[0, 1]

print("Correlation Coefficient:", correlation_coefficient)


#Using Pandas

import pandas as pd
import numpy as np

# Same synthetic pair as the NumPy version: a base sample and a noisy,
# scaled copy of it
np.random.seed(0)
dataset1 = np.random.normal(loc=0, scale=1, size=100)
dataset2 = 0.8 * dataset1 + np.random.normal(loc=0, scale=0.2, size=100)

# Put both samples in one DataFrame, one column each
df = pd.DataFrame({"Dataset1": dataset1, "Dataset2": dataset2})

# Series.corr computes the Pearson correlation by default
first_column = df["Dataset1"]
second_column = df["Dataset2"]
correlation_coefficient = first_column.corr(second_column)

print("Correlation Coefficient:", correlation_coefficient)


11. Create a scatter plot to visualize the relationship between two variables.

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# x is a standard-normal sample; y follows x linearly (slope 0.8) with small noise
np.random.seed(0)
x = np.random.normal(loc=0, scale=1, size=100)
y = 0.8 * x + np.random.normal(loc=0, scale=0.2, size=100)

# Scatter plot drawn through the explicit Axes interface
fig, ax = plt.subplots()
ax.scatter(x, y, alpha=0.7, color="blue", edgecolor="black")
ax.set_title("Scatter Plot of X vs Y")
ax.set_xlabel("X")
ax.set_ylabel("Y")
plt.show()


12. Implement and compare simple random sampling and systematic sampling.

In [None]:
import numpy as np
import pandas as pd

# Create a population dataset
np.random.seed(0)
population = np.arange(1, 101)
sample_size = 10  # number of units to draw with each technique

## Simple Random Sampling
# Every unit has the same chance of selection; replace=False prevents duplicates
random_sample = np.random.choice(population, size=sample_size, replace=False)
print("Simple Random Sample:")
print(random_sample)

## Systematic Sampling
# The sampling interval follows from the population and sample sizes instead of
# being hard-coded, so changing either one keeps the scheme consistent.
k = len(population) // sample_size  # sampling interval
start_index = np.random.randint(0, k)  # random start within the first interval
# Take every k-th unit from the random start; trim in case the population size
# is not an exact multiple of the sample size.
systematic_sample = population[start_index::k][:sample_size]
print("\nSystematic Sample:")
print(systematic_sample)

## Comparison
# Both samples estimate the population mean. The systematic sample is spread
# evenly across the ordered population, whereas the random sample may cluster.
print("\nComparison of Sample Means:")
print("Population Mean:", population.mean())
print("Simple Random Sample Mean:", random_sample.mean())
print("Systematic Sample Mean:", systematic_sample.mean())


13. Calculate the mean, median, and mode of grouped data.

In [None]:
import pandas as pd
import numpy as np

# Create grouped data
data = {
    "Class Interval": ["0-10", "10-20", "20-30", "30-40", "40-50"],
    "Frequency": [5, 10, 15, 8, 2]
}

df = pd.DataFrame(data)

# Derive the class boundaries from the "lower-upper" labels rather than
# hard-coding them, so the calculations follow the data.
# (Assumes non-negative bounds separated by a single hyphen.)
bounds = df["Class Interval"].str.split("-", expand=True).astype(float)
lower = bounds[0].to_numpy()
upper = bounds[1].to_numpy()
widths = upper - lower
frequencies = df["Frequency"].to_numpy()
total = frequencies.sum()

# Calculate midpoints
df["Midpoint"] = (lower + upper) / 2

# Calculate mean: sum(f * midpoint) / sum(f)
df["Product"] = df["Midpoint"] * df["Frequency"]
mean = df["Product"].sum() / total
print("Mean:", mean)

# Calculate median: L + ((N/2 - CF_prev) / f) * h
#   L       lower boundary of the median class
#   CF_prev cumulative frequency of the classes before the median class
#   f       frequency of the median class
#   h       width of the median class
cf = df["Frequency"].cumsum()
# The median class is the first class whose cumulative frequency reaches N/2
median_pos = int(np.argmax(cf.to_numpy() >= total / 2))
median_class = df.iloc[median_pos]
cf_before = cf.iloc[median_pos - 1] if median_pos > 0 else 0
median = lower[median_pos] + ((total / 2 - cf_before) / frequencies[median_pos]) * widths[median_pos]
print("Median:", median)

# Calculate mode: L + ((f1 - f0) / (2*f1 - f0 - f2)) * h
#   f1 frequency of the modal class, f0 / f2 those of its neighbours
#   (taken as 0 when the modal class is the first or last class)
mode_pos = int(np.argmax(frequencies))
mode_class = df.iloc[mode_pos]
f1 = frequencies[mode_pos]
f0 = frequencies[mode_pos - 1] if mode_pos > 0 else 0
f2 = frequencies[mode_pos + 1] if mode_pos < len(frequencies) - 1 else 0
denominator = 2 * f1 - f0 - f2
if denominator > 0:
    mode = lower[mode_pos] + ((f1 - f0) / denominator) * widths[mode_pos]
else:
    # Modal class and both neighbours are equally frequent: no interpolation
    # is possible, so fall back to the class midpoint.
    mode = mode_class["Midpoint"]
print("Mode:", mode)



14.  Simulate data using Python and calculate its central tendency and dispersion.

In [None]:
import numpy as np
import pandas as pd

# Simulate data
np.random.seed(0)
data = np.random.normal(loc=5, scale=2, size=100)

# Calculate central tendency
mean = np.mean(data)
median = np.median(data)
# Continuous draws are effectively all distinct, so an exact-value mode
# (e.g. pd.Series(data).mode()) returns every observation and its first entry
# is just the smallest value. Estimate the mode instead as the centre of the
# most populated histogram bin.
counts, bin_edges = np.histogram(data, bins="auto")
peak_bin = np.argmax(counts)
mode = (bin_edges[peak_bin] + bin_edges[peak_bin + 1]) / 2

print("Central Tendency:")
print(f"Mean: {mean}")
print(f"Median: {median}")
print(f"Mode: {mode}")

# Calculate dispersion
range_value = np.ptp(data)  # max - min
variance = np.var(data)     # population variance (NumPy default, ddof=0)
std_dev = np.std(data)      # population standard deviation
iqr = np.percentile(data, 75) - np.percentile(data, 25)

print("\nDispersion:")
print(f"Range: {range_value}")
print(f"Variance: {variance}")
print(f"Standard Deviation: {std_dev}")
print(f"Interquartile Range (IQR): {iqr}")


15.  Use NumPy or pandas to summarize a dataset’s descriptive statistics.

In [None]:
import pandas as pd
import numpy as np

# Create a dataset
np.random.seed(0)
data = np.random.normal(loc=5, scale=2, size=100)

# Create a pandas Series
series = pd.Series(data)

# Calculate descriptive statistics (count, mean, std, min, quartiles, max).
# Stored as `summary_stats` rather than `stats`: other cells in this notebook
# bind `stats` to the scipy.stats module, and reusing the name here would
# silently replace that module in the kernel namespace.
summary_stats = series.describe()

print(summary_stats)


16. Plot a boxplot to understand the spread and identify outliers.

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# 100 seeded normal draws followed by three deliberately extreme values, so
# the boxplot has outliers to show
np.random.seed(0)
outliers = [15, 16, 17]
data = np.concatenate((np.random.normal(loc=5, scale=2, size=100), outliers))

# Horizontal boxplot drawn through the explicit Axes interface
fig, ax = plt.subplots(figsize=(8, 6))
ax.boxplot(data, vert=False)
ax.set_title("Boxplot of Data")
plt.show()


17. Calculate the interquartile range (IQR) of a dataset.

In [None]:
import numpy as np

# Seeded normal sample (mean 5, standard deviation 2, 100 points)
np.random.seed(0)
data = np.random.normal(loc=5, scale=2, size=100)

# Both quartiles in a single call: Q1 is the 25th percentile, Q3 the 75th
Q1, Q3 = np.percentile(data, [25, 75])

# The IQR is the width of the middle 50% of the data
IQR = Q3 - Q1

print("Q1 (25th percentile):", Q1)
print("Q3 (75th percentile):", Q3)
print("Interquartile Range (IQR):", IQR)


18.  Implement Z-score normalization and explain its significance.

In [None]:
import numpy as np

# Seeded normal sample (mean 5, standard deviation 2, 100 points)
np.random.seed(0)
data = np.random.normal(loc=5, scale=2, size=100)

# Location and spread of the raw sample
mean = data.mean()
std_dev = data.std()

# Z-score: express every value as its distance from the mean in units of
# standard deviation. The result has mean 0 and standard deviation 1, which
# puts variables measured on different scales on a common footing.
centered = data - mean
normalized_data = centered / std_dev

print("Original Data Mean:", data.mean())
print("Original Data Standard Deviation:", data.std())
print("Normalized Data Mean:", normalized_data.mean())
print("Normalized Data Standard Deviation:", normalized_data.std())


19. Compare two datasets using their standard deviations.

In [None]:
import numpy as np

# Two seeded normal samples with the same centre but different spread
np.random.seed(0)
dataset1 = np.random.normal(loc=5, scale=1, size=100)
dataset2 = np.random.normal(loc=5, scale=2, size=100)

# Population standard deviation of each sample
std_dev1, std_dev2 = dataset1.std(), dataset2.std()

print("Dataset 1 Standard Deviation:", std_dev1)
print("Dataset 2 Standard Deviation:", std_dev2)

# Map the ordering of the two values (-1 smaller, 1 larger, 0 otherwise)
# onto the message describing it
messages = {
    -1: "Dataset 1 has less variability than Dataset 2.",
    1: "Dataset 1 has more variability than Dataset 2.",
    0: "Both datasets have similar variability.",
}
ordering = int(std_dev1 > std_dev2) - int(std_dev1 < std_dev2)
print(messages[ordering])


20. Write a Python program to visualize covariance using a heatmap.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Draw 100 seeded observations of four correlated normal features.
# true_cov is the covariance matrix the sample is generated from.
np.random.seed(0)
feature_names = ["Feature1", "Feature2", "Feature3", "Feature4"]
true_cov = [
    [1, 0.5, 0.2, 0.1],
    [0.5, 1, 0.3, 0.2],
    [0.2, 0.3, 1, 0.4],
    [0.1, 0.2, 0.4, 1],
]
data = np.random.multivariate_normal(mean=[0, 0, 0, 0], cov=true_cov, size=100)

# One column per feature
df = pd.DataFrame(data, columns=feature_names)

# Sample covariance between every pair of features
cov_matrix = df.cov()

# Annotated heatmap of the covariance matrix on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cov_matrix, annot=True, cmap="coolwarm", square=True, ax=ax)
ax.set_title("Covariance Matrix Heatmap")
plt.show()



21.  Use seaborn to create a correlation matrix for a dataset.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Draw 100 seeded observations of four correlated normal features.
# true_cov is the covariance matrix the sample is generated from.
np.random.seed(0)
feature_names = ["Feature1", "Feature2", "Feature3", "Feature4"]
true_cov = [
    [1, 0.5, 0.2, 0.1],
    [0.5, 1, 0.3, 0.2],
    [0.2, 0.3, 1, 0.4],
    [0.1, 0.2, 0.4, 1],
]
data = np.random.multivariate_normal(mean=[0, 0, 0, 0], cov=true_cov, size=100)

# One column per feature
df = pd.DataFrame(data, columns=feature_names)

# Pairwise correlation between the features
corr_matrix = df.corr()

# Annotated heatmap of the correlation matrix on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", square=True, ax=ax)
ax.set_title("Correlation Matrix")
plt.show()



22.  Generate a dataset and implement both variance and standard deviation computations.

In [None]:
import numpy as np

# Seeded normal sample (mean 5, standard deviation 2, 100 points)
np.random.seed(0)
data = np.random.normal(loc=5, scale=2, size=100)

# Population variance (ddof=0): average squared deviation from the mean
variance = data.var()

# Population standard deviation: square root of the variance
std_dev = data.std()

print("Dataset Mean:", data.mean())
print("Variance:", variance)
print("Standard Deviation:", std_dev)


23. Visualize skewness and kurtosis using Python libraries like matplotlib or seaborn.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Seed once, then draw the two samples in a fixed order:
# a right-skewed exponential sample followed by a symmetric normal one
np.random.seed(0)
skewed_data = np.random.exponential(scale=1, size=1000)
normal_data = np.random.normal(loc=0, scale=1, size=1000)

# Shape statistics for each sample (kurtosis uses SciPy's default definition)
skewness_skewed, kurtosis_skewed = stats.skew(skewed_data), stats.kurtosis(skewed_data)
skewness_normal, kurtosis_normal = stats.skew(normal_data), stats.kurtosis(normal_data)

print("Skewed Data Skewness:", skewness_skewed)
print("Skewed Data Kurtosis:", kurtosis_skewed)
print("Normal Data Skewness:", skewness_normal)
print("Normal Data Kurtosis:", kurtosis_normal)

# Side-by-side histograms with a KDE overlay, one Axes per sample
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

sns.histplot(skewed_data, kde=True, ax=axes[0])
axes[0].set_title("Skewed Data")

sns.histplot(normal_data, kde=True, ax=axes[1])
axes[1].set_title("Normal Data")

plt.show()




24.  Implement the Pearson and Spearman correlation coefficients for a dataset.

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr, spearmanr

# Draw 100 seeded observations of two normal features generated with a
# covariance of 0.7 between them
np.random.seed(0)
data = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0.7], [0.7, 1]], size=100)

# One column per feature
df = pd.DataFrame(data, columns=["Feature1", "Feature2"])
feature1 = df["Feature1"]
feature2 = df["Feature2"]

# Pearson: strength of the linear relationship, returned with its p-value
pearson_coef, pearson_p = pearsonr(feature1, feature2)
print("Pearson Correlation Coefficient:", pearson_coef)
print("Pearson p-value:", pearson_p)

# Spearman: rank-based (monotonic) relationship, returned with its p-value
spearman_coef, spearman_p = spearmanr(feature1, feature2)
print("Spearman Correlation Coefficient:", spearman_coef)
print("Spearman p-value:", spearman_p)
