# Foundations: pandas data analysis and visualization


In [None]:
from sklearn import datasets
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


## pandas basics

In [None]:
# Assemble a small participant table, one keyword per column
data = dict(
    name=["Alice", "Bob", "Charlie", "David"],
    age=[25, 30, 35, 40],
    score=[85.5, 92.0, 88.0, 95.5],
)
df = pd.DataFrame(data)
print("DataFrame:\n", df, "\n")

# Pull out one column, then summarise the two numeric columns
print("Names column:\n", df["name"])
print("Average age:", df["age"].mean())
print("Describe scores:\n", df["score"].describe())

# Bar chart of score per participant; labels are set on the Axes object
# returned by pandas rather than through the implicit "current axes"
ax = df.plot.bar(x="name", y="score", title="Participant scores", legend=False)
ax.set_xlabel("name")
ax.set_ylabel("score")
ax.figure.tight_layout()
plt.show()

## Exploratory data analysis

In [None]:
# Fetch the Iris data from scikit-learn and combine the measurements with
# the integer class labels in a single DataFrame
iris = datasets.load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)

# Preview of the top of the table
print("First five rows:\n", df.head(), "\n")

# Per-column descriptive statistics
print("Summary statistics:\n", df.describe(), "\n")

# Sepal length against petal length, with points coloured by class label
ax = df.plot.scatter(
    x="sepal length (cm)",
    y="petal length (cm)",
    c="target",
    cmap="viridis",
)
ax.set_title("Iris feature scatter plot")
plt.show()

# Distribution of petal width across all samples, in 20 bins
ax = df["petal width (cm)"].hist(bins=20)
ax.set_title("Petal width distribution")
ax.set_xlabel("width (cm)")
ax.set_ylabel("count")
plt.show()

## CSV input and output

In [None]:
# Small table of three cities and a population figure for each
# (units are not stated in the data; presumably millions -- TODO confirm)
data = dict(
    city=["San Diego", "Los Angeles", "San Francisco"],
    population=[1.4, 3.9, 0.88],
)
df = pd.DataFrame(data)
print("Original DataFrame:\n", df, "\n")

# Save to disk; index=False keeps the row index out of the file
csv_path = "cities.csv"
df.to_csv(path_or_buf=csv_path, index=False)
print(f"Data written to {csv_path}")

# Round trip: load the file that was just written and show the result
loaded = pd.read_csv(filepath_or_buffer=csv_path)
print("\nLoaded from CSV:\n", loaded)

## DataFrame merging and grouping

In [None]:
# Titanic passenger table, read directly from the seaborn-data repository
url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv"
titanic = pd.read_csv(url)

# Lookup table that attaches a luxury label to each ticket class
class_map = pd.DataFrame(
    {
        "class": ["First", "Second", "Third"],
        "luxury_level": ["high", "medium", "low"],
    }
)

# Join on the shared "class" column (pandas' default inner join), then
# average fare and survival within each luxury level
merged = pd.merge(titanic, class_map, on="class")
summary = merged.groupby("luxury_level")[["fare", "survived"]].mean()
summary


## Advanced data visualization


In [None]:
# Load seaborn's example "tips" dataset and preview the first rows.
# The trailing '\n' argument prints a blank line after the preview; it must
# stay an escape sequence -- a literal line break inside the quotes is a
# SyntaxError (unterminated string literal).
tips = sns.load_dataset('tips')
print(tips.head(), '\n')

# Grid of pairwise plots over the dataset, coloured by the 'sex' column
sns.pairplot(tips, hue='sex')
# y=1.02 positions the figure title slightly above the top of the grid
plt.suptitle('Pairplot of tips dataset', y=1.02)
plt.show()

# Correlation matrix over the numeric columns only, drawn as an
# annotated heatmap
corr = tips.corr(numeric_only=True)
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title('Correlation heatmap')
plt.show()