# **1. Create Your Own Dataset** #

> **We will construct a 5x50 table using lists, with different data types, then convert to data-frames**

In [116]:
import numpy as np
import pandas as pd
import random as rd

**Create our data and structure**

In [117]:
rowNum = 50

In [118]:
## Generate a random orderNum (int);
orderNum = rd.randint(1000,100000)
## Generate a random orderValue (float)
orderVal = 0.00
## Generate destination cities (String)
location = ["Houston","Seattle","Billings","Los Angeles","Hartford", "Miami", "New York"]
## Random dates
date = ["10/6/2025", "10/5/2025", "10/7/2025","10/9/2025"]
## Random free shipping
shipping = [True,False]

**Create our shipping table**

In [119]:
orders = []
for i in range(1,rowNum + 1):
    row = {
        "Order Number": i + orderNum,
        "Order Value": round(rd.uniform(100,10000), 2),
        "Location": rd.choice(location),
        "Date": rd.choice(date),
        "Free Shipping": rd.choice(shipping)
    }
    orders.append(row)

**Convert to a DataFrame**

In [120]:
## create and print our dataframe
ordersDf = pd.DataFrame(orders)
ordersDf

Unnamed: 0,Order Number,Order Value,Location,Date,Free Shipping
0,32758,4822.6,Seattle,10/6/2025,True
1,32759,8655.98,Seattle,10/7/2025,False
2,32760,2566.93,New York,10/6/2025,True
3,32761,4909.35,Billings,10/5/2025,False
4,32762,9351.57,Houston,10/9/2025,True
5,32763,3826.43,Miami,10/7/2025,True
6,32764,1730.31,Hartford,10/6/2025,True
7,32765,490.35,Houston,10/5/2025,False
8,32766,5452.63,Hartford,10/5/2025,False
9,32767,8372.09,Billings,10/9/2025,True


## **2. Data Exploration W/ Pandas**

In [121]:
## look at our first 5
firstArr = ordersDf.iloc[0:5]
## look at our last 5
lastArr = ordersDf.iloc[44:49]
## print our first 5
firstArr

Unnamed: 0,Order Number,Order Value,Location,Date,Free Shipping
0,32758,4822.6,Seattle,10/6/2025,True
1,32759,8655.98,Seattle,10/7/2025,False
2,32760,2566.93,New York,10/6/2025,True
3,32761,4909.35,Billings,10/5/2025,False
4,32762,9351.57,Houston,10/9/2025,True


In [122]:
## print our last 5
lastArr

Unnamed: 0,Order Number,Order Value,Location,Date,Free Shipping
44,32802,6788.02,Billings,10/7/2025,True
45,32803,4464.71,Houston,10/7/2025,True
46,32804,1350.02,Houston,10/9/2025,True
47,32805,8569.03,Miami,10/9/2025,False
48,32806,1644.53,Billings,10/6/2025,False


In [123]:
## print our data types
ordersDf.dtypes

Order Number       int64
Order Value      float64
Location          object
Date              object
Free Shipping       bool
dtype: object

In [124]:
## print info as a data frame
ordersDf.describe()

Unnamed: 0,Order Number,Order Value
count,50.0,50.0
mean,32782.5,4861.6382
std,14.57738,2995.464375
min,32758.0,490.35
25%,32770.25,1899.9375
50%,32782.5,4719.475
75%,32794.75,7697.2
max,32807.0,9755.19


In [125]:
## Print the info as plain text
ordersDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Order Number   50 non-null     int64  
 1   Order Value    50 non-null     float64
 2   Location       50 non-null     object 
 3   Date           50 non-null     object 
 4   Free Shipping  50 non-null     bool   
dtypes: bool(1), float64(1), int64(1), object(2)
memory usage: 1.7+ KB


In [126]:
## we can use .unique() to find unique values
print(ordersDf["Order Value"].unique())
## Seperator for cleanliness
print("\n")
## specifics per attribute
print(ordersDf["Location"].value_counts())
## Seperator for cleanliness
print("\n")
## or we can use .nunique for the whole table
print(ordersDf.nunique())

[4822.6  8655.98 2566.93 4909.35 9351.57 3826.43 1730.31  490.35 5452.63
 8372.09 1895.44 5509.2  1913.43 3494.01 6640.39 6652.17 7782.18 6265.62
 4528.11 9208.47 3310.71 9755.19 1120.52 7340.99 4616.35 2825.72 8451.56
 8224.05 6967.47 1479.28  940.71 9487.09 7864.77 1408.22 2108.77  547.18
 3719.68 7442.26 2325.78 6858.4  7994.   8466.69  626.81 1129.64 6788.02
 4464.71 1350.02 8569.03 1644.53 1186.5 ]


Location
Billings       12
Seattle         9
Houston         8
Hartford        7
New York        6
Miami           5
Los Angeles     3
Name: count, dtype: int64


Order Number     50
Order Value      50
Location          7
Date              4
Free Shipping     2
dtype: int64


## **3. Column Challenge**

> **We can add a column using PANDAS**

In [127]:
ordersDf["Delivered"] = (ordersDf["Free Shipping"] != True) & (ordersDf["Date"] >= "10/7/2025")

In [128]:
ordersDf

Unnamed: 0,Order Number,Order Value,Location,Date,Free Shipping,Delivered
0,32758,4822.6,Seattle,10/6/2025,True,False
1,32759,8655.98,Seattle,10/7/2025,False,True
2,32760,2566.93,New York,10/6/2025,True,False
3,32761,4909.35,Billings,10/5/2025,False,False
4,32762,9351.57,Houston,10/9/2025,True,False
5,32763,3826.43,Miami,10/7/2025,True,False
6,32764,1730.31,Hartford,10/6/2025,True,False
7,32765,490.35,Houston,10/5/2025,False,False
8,32766,5452.63,Hartford,10/5/2025,False,False
9,32767,8372.09,Billings,10/9/2025,True,False


In [129]:
## Lets save this to a CSV file,
ordersDf.to_csv("MaxShuford_Lab1_Columns.csv")