In [None]:
import pandas as pd
import random

# Column order of the generated frame; each row tuple below follows it.
columns = ["RecordNumber", "Country", "City", "Zipcode", "State"]

# Sample values taken from simple-zipcodes.csv.
cities = ["PARC PARQUE", "PASEO COSTA DEL SUR", "BDA SAN LUIS", "HOLT", "HOMOSASSA",
          "CINGULAR WIRELESS", "FORT WORTH", "FT WORTH", "SPRUCE PINE", "ASH HILL",
          "URB EUGENE RICE", "MESA", "HILLIARD", "HOLDER", "SECT LANAUSSE",
          "SPRING GARDEN", "SPRINGVILLE", "ASHEBORO"]
states = ["PR", "FL", "TX", "AL", "NC", "AZ"]
# Zip codes are kept as strings. "704" and "709" are only three characters
# long -- presumably leading zeros were dropped upstream; TODO confirm.
zipcodes = ["704", "709", "32564", "34487", "76166", "76177", "35585", "27007",
            "85209", "85210", "32046", "34445", "35146", "27203", "27204"]

# Use a dedicated, seeded generator so that Restart Kernel -> Run All
# regenerates exactly the same rows (and therefore the same counts and
# charts) without touching the global `random` state.
SEED = 42
rng = random.Random(SEED)

# Generate 100,000 rows.
# City, Zipcode and State are drawn independently of each other, so a row is
# not guaranteed to be a combination that exists in the source file.
num_rows = 100000
data = [
    (i, "US", rng.choice(cities), rng.choice(zipcodes), rng.choice(states))
    for i in range(1, num_rows + 1)
]

# Create DataFrame and preview the first rows
df = pd.DataFrame(data, columns=columns)
df.head()

In [None]:
# Report the frame's dimensions, then the number of missing values per column.
row_count, column_count = df.shape
print(f"Number of rows: {row_count}")
print(f"Number of columns: {column_count}")
print(df.isna().sum())

In [None]:
import plotly.express as px

# Count the distinct zip codes that occur in each state.
# `.size()` would count rows instead, which counts a repeated zip code once
# per row and so does not match the "Number of Zip Codes" axis label.
zip_by_state = (
    df.groupby("State")["Zipcode"]
    .nunique()
    .reset_index(name="zip_count")
)

# Create interactive bar chart: one bar per state, coloured by state
fig1 = px.bar(zip_by_state, x="State", y="zip_count",
              title="Distribution of Zip Codes by State",
              labels={"zip_count": "Number of Zip Codes", "State": "State"},
              color="State")
# The x axis already names each state, so the colour legend is hidden.
fig1.update_layout(showlegend=False)
fig1.show()

# Export as HTML (written relative to the current working directory)
fig1.write_html("zip_by_state.html")

In [None]:
# Number of rows per city, then keep the ten cities with the most rows.
city_counts = df.groupby("City").size().reset_index(name="count")
top_cities = city_counts.sort_values("count", ascending=False).head(10)

# Interactive bar chart with the count on x and the city name on y.
fig2 = px.bar(
    top_cities,
    x="count",
    y="City",
    color="City",
    title="Top 10 Cities by Frequency",
    labels={"count": "Number of Records", "City": "City"},
)
# Each bar is already labelled on the y axis, so the legend is hidden.
fig2.update_layout(showlegend=False)
fig2.show()

# Save the chart as an HTML file.
fig2.write_html("top_cities.html")

In [None]:
# Plot a fixed-seed sample of 1000 rows rather than the full frame to keep
# rendering time down.
df_sample = df.sample(n=1000, random_state=42)

# Interactive scatter plot: record number on x, state on y, coloured by state.
fig3 = px.scatter(
    df_sample,
    x="RecordNumber",
    y="State",
    color="State",
    title="RecordNumber Distribution by State (Sampled)",
    labels={"RecordNumber": "Record Number", "State": "State"},
)
fig3.update_layout(showlegend=True)
fig3.show()

# Save the chart as an HTML file.
fig3.write_html("record_by_state.html")