# **1. Create Your Own Dataset** #

> **We will build a table of 50 rows × 5 columns from Python lists, using several different data types, and then convert it to a pandas DataFrame**

In [145]:
import numpy as np
import pandas as pd
import random as rd

**Create our data and structure**

In [146]:
## Number of rows (orders) to generate for the table
rowNum = 50

In [147]:
## Seed the random module first so that Restart & Run All regenerates the same table
SEED = 42
rd.seed(SEED)
## Random base order number (int, 1000..100000 inclusive); rows are numbered upward from it
orderNum = rd.randint(1000,100000)
## Placeholder order value (float); not read by the cells shown in this notebook
orderVal = 0.00
## Destination cities to draw from (String)
location = ["Houston","Seattle","Billings","Los Angeles","Hartford", "Miami", "New York"]
## Order dates to draw from, written as M/D/YYYY strings
date = ["10/6/2025", "10/5/2025", "10/7/2025","10/9/2025"]
## Free-shipping flag options (bool)
shipping = [True,False]

**Create our shipping table**

In [148]:
## Build rowNum order records, one dict per row, each filled with randomly drawn values.
## Order numbers run consecutively, starting at orderNum + 1.
orders = [
    {
        "Order Number": orderNum + offset,
        "Order Value": round(rd.uniform(100,10000), 2),
        "Location": rd.choice(location),
        "Date": rd.choice(date),
        "Free Shipping": rd.choice(shipping),
    }
    for offset in range(1, rowNum + 1)
]

**Convert to a DataFrame**

In [149]:
## Turn the list of row dicts into a DataFrame (dict keys become the column names)
ordersDf = pd.DataFrame(data=orders)
## A bare last expression makes the notebook render the table
ordersDf

Unnamed: 0,Order Number,Order Value,Location,Date,Free Shipping
0,56678,8452.82,Miami,10/7/2025,True
1,56679,665.39,New York,10/9/2025,False
2,56680,5645.57,Billings,10/6/2025,False
3,56681,5496.3,Los Angeles,10/6/2025,True
4,56682,4258.65,Houston,10/9/2025,False
5,56683,7956.09,Los Angeles,10/7/2025,True
6,56684,3867.48,Billings,10/7/2025,True
7,56685,3449.08,Los Angeles,10/9/2025,True
8,56686,917.53,New York,10/7/2025,True
9,56687,3286.82,New York,10/5/2025,True


## **2. Data Exploration W/ Pandas**

In [150]:
## Keep the first five and the last five rows for inspection
firstArr, lastArr = ordersDf.head(n=5), ordersDf.tail(n=5)
## Show the first five rows
firstArr

Unnamed: 0,Order Number,Order Value,Location,Date,Free Shipping
0,56678,8452.82,Miami,10/7/2025,True
1,56679,665.39,New York,10/9/2025,False
2,56680,5645.57,Billings,10/6/2025,False
3,56681,5496.3,Los Angeles,10/6/2025,True
4,56682,4258.65,Houston,10/9/2025,False


In [151]:
## Show the last five rows captured in the previous cell
lastArr

Unnamed: 0,Order Number,Order Value,Location,Date,Free Shipping
45,56723,6338.78,Billings,10/6/2025,True
46,56724,9862.93,Hartford,10/6/2025,False
47,56725,1969.26,New York,10/9/2025,False
48,56726,2617.84,New York,10/5/2025,True
49,56727,2203.68,Seattle,10/5/2025,True


In [152]:
## Show the data type pandas assigned to each column
ordersDf.dtypes

Order Number       int64
Order Value      float64
Location          object
Date              object
Free Shipping       bool
dtype: object

In [153]:
## Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns,
## returned as a DataFrame
ordersDf.describe()

Unnamed: 0,Order Number,Order Value
count,50.0,50.0
mean,56702.5,5089.798
std,14.57738,2977.72568
min,56678.0,665.39
25%,56690.25,2492.1325
50%,56702.5,4480.59
75%,56714.75,7923.3875
max,56727.0,9993.16


In [154]:
## Print a plain-text overview: index range, non-null count and dtype per column, memory usage
ordersDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Order Number   50 non-null     int64  
 1   Order Value    50 non-null     float64
 2   Location       50 non-null     object 
 3   Date           50 non-null     object 
 4   Free Shipping  50 non-null     bool   
dtypes: bool(1), float64(1), int64(1), object(2)
memory usage: 1.7+ KB


In [155]:
## Three views of uniqueness, printed in this order:
##   1. .unique()       -> the distinct values of a single column
##   2. .value_counts() -> how many rows carry each value of a column
##   3. .nunique()      -> the number of distinct values in every column of the table
uniquenessViews = (
    ordersDf["Order Value"].unique(),
    ordersDf["Location"].value_counts(),
    ordersDf.nunique(),
)
## Separator for cleanliness: two blank lines between consecutive views
print(*uniquenessViews, sep="\n\n\n")

[8452.82  665.39 5645.57 5496.3  4258.65 7956.09 3867.48 3449.08  917.53
 3286.82 2741.46 1905.8  6323.6  6040.91 8489.65 3922.43  876.47 4950.8
 4103.31 3055.82 8529.53 9993.16 4702.53 5680.53  848.2  2266.5  2450.23
 2331.34 8362.61 1922.02 3102.97 9991.   6323.33 1460.7  9827.46 3727.37
 8941.91 6435.71 9253.06 7825.28 7546.88 1657.87 2995.95 8948.33 9966.96
 6338.78 9862.93 1969.26 2617.84 2203.68]


Location
New York       11
Billings        9
Houston         7
Hartford        7
Los Angeles     6
Miami           5
Seattle         5
Name: count, dtype: int64


Order Number     50
Order Value      50
Location          7
Date              4
Free Shipping     2
dtype: int64


## **3. Column Challenge**

> **We can add a new, calculated column using pandas**

In [156]:
## Create our new calculated column: "Delivered" is True for orders that do NOT have
## free shipping and are dated on or after the cutoff date.
## The "Date" column holds M/D/YYYY strings, so it is parsed into real dates before
## comparing. Comparing the raw strings goes character by character and would, for
## example, rank "10/10/2025" before "10/7/2025".
deliveredCutoff = pd.Timestamp(year=2025, month=10, day=7)
orderDates = pd.to_datetime(ordersDf["Date"], format="%m/%d/%Y")
## ~ negates the boolean "Free Shipping" column element-wise
ordersDf["Delivered"] = ~ordersDf["Free Shipping"] & (orderDates >= deliveredCutoff)
## Display the table with the new column
ordersDf

Unnamed: 0,Order Number,Order Value,Location,Date,Free Shipping,Delivered
0,56678,8452.82,Miami,10/7/2025,True,False
1,56679,665.39,New York,10/9/2025,False,True
2,56680,5645.57,Billings,10/6/2025,False,False
3,56681,5496.3,Los Angeles,10/6/2025,True,False
4,56682,4258.65,Houston,10/9/2025,False,True
5,56683,7956.09,Los Angeles,10/7/2025,True,False
6,56684,3867.48,Billings,10/7/2025,True,False
7,56685,3449.08,Los Angeles,10/9/2025,True,False
8,56686,917.53,New York,10/7/2025,True,False
9,56687,3286.82,New York,10/5/2025,True,False


In [157]:
## Let's save the table to a CSV file in the current working directory.
## NOTE(review): to_csv writes the row index as an unnamed first column by default;
## pass index=False if the file should hold only the data columns.
ordersDf.to_csv("MaxShuford_Lab1_Columns.csv")