## Import Packages

In [1]:
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
from pyspark.sql.functions import col

In [4]:
from pyspark.sql.functions import desc

In [5]:
from pyspark.sql.functions import asc

In [6]:
import findspark

In [7]:
# Initialise findspark so the local Spark installation can be used from this notebook.
# NOTE(review): pyspark and SparkSession are already imported in the cells above,
# so this call runs after the imports it is usually placed before — confirm
# whether it should be moved ahead of them.
findspark.init()

In [8]:
# Obtain the Spark session for this analysis (reuses an existing session if
# one is already running, otherwise creates one named '400MHurdles').
spark = (
    SparkSession.builder
    .appName('400MHurdles')
    .getOrCreate()
)

## Import Data

In [11]:
# Load the men's CSV into a Spark DataFrame: the first row is treated as the
# header and column types are inferred from the data.
Mens400mHurdles = (
    spark.read
    .format('csv')
    .option('inferSchema', 'true')
    .option('header', 'true')
    .option('path', 'Mens400mHurdles.csv')
    .load()
)

In [12]:
# Load the women's CSV into a Spark DataFrame: the first row is treated as the
# header and column types are inferred from the data.
Womens400mHurdles = (
    spark.read
    .format('csv')
    .option('inferSchema', 'true')
    .option('header', 'true')
    .option('path', 'Womens400mHurdles.csv')
    .load()
)

In [13]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

## Create View

In [14]:
# Register the men's DataFrame as a temporary view so it can be queried by
# name from spark.sql(...).
Mens400mHurdles.createOrReplaceTempView(name="Mens400mHurdles")

In [15]:
# Register the women's DataFrame as a temporary view so it can be queried by
# name from spark.sql(...).
Womens400mHurdles.createOrReplaceTempView(name="Womens400mHurdles")

In [None]:
## Basic Selection

In [None]:
# Select every row and column of the men's view and print the first rows.
resultone = spark.sql(
    "SELECT * FROM Mens400mHurdles"
)
resultone.show()

In [None]:
# Select every row and column of the women's view and print the first rows.
resulttwo = spark.sql(
    "SELECT * FROM Womens400mHurdles"
)
resulttwo.show()

In [None]:
## Query One (Times)

In [None]:
# Count how often each value of Times occurs in the men's data, most frequent
# first. The SQL text is split by clause; adjacent literals concatenate into
# the same single-line statement.
QueryOneA = spark.sql(
    "SELECT Times, COUNT(*) as count "
    "FROM Mens400mHurdles "
    "GROUP BY Times "
    "ORDER BY count DESC"
)
QueryOneA.show()

In [None]:
# Count how often each value of Times occurs in the women's data, most
# frequent first. The SQL text is split by clause; adjacent literals
# concatenate into the same single-line statement.
QueryOneB = spark.sql(
    "SELECT Times, COUNT(*) as count "
    "FROM Womens400mHurdles "
    "GROUP BY Times "
    "ORDER BY count DESC"
)
QueryOneB.show()

In [None]:
## Query Two (Schools)

In [None]:
# Count the men's rows per value of Schools, largest count first.
QueryTwoA = spark.sql(
    "SELECT Schools, COUNT(*) as count "
    "FROM Mens400mHurdles "
    "GROUP BY Schools "
    "ORDER BY count DESC"
)
QueryTwoA.show()

In [None]:
# Count the women's rows per value of Schools, largest count first.
QueryTwoB = spark.sql(
    "SELECT Schools, COUNT(*) as count "
    "FROM Womens400mHurdles "
    "GROUP BY Schools "
    "ORDER BY count DESC"
)
QueryTwoB.show()

In [None]:
## Query Three (Times Percentages)

In [None]:
# For each value of Times in the men's data: its row count and that count as
# a percentage of all rows in Mens400mHurdles (the scalar subquery supplies
# the total).
QueryThreeA = spark.sql(
    "SELECT Times, COUNT(*) as count, "
    "(COUNT(*) / (SELECT COUNT(*) FROM Mens400mHurdles)) * 100 as percentage "
    "FROM Mens400mHurdles "
    "GROUP BY Times"
)
QueryThreeA.show()

In [None]:
# For each value of Times in the women's data: its row count and that count
# as a percentage of all rows in Womens400mHurdles (the scalar subquery
# supplies the total).
QueryThreeB = spark.sql(
    "SELECT Times, COUNT(*) as count, "
    "(COUNT(*) / (SELECT COUNT(*) FROM Womens400mHurdles)) * 100 as percentage "
    "FROM Womens400mHurdles "
    "GROUP BY Times"
)
QueryThreeB.show()

In [None]:
## Query Four (School Percentages)

In [None]:
# For each value of Schools in the men's data: its row count and that count
# as a percentage of all rows in Mens400mHurdles (the scalar subquery
# supplies the total).
QueryFourA = spark.sql(
    "SELECT Schools, COUNT(*) as count, "
    "(COUNT(*) / (SELECT COUNT(*) FROM Mens400mHurdles)) * 100 as percentage "
    "FROM Mens400mHurdles "
    "GROUP BY Schools"
)
QueryFourA.show()

In [None]:
# For each value of Schools in the women's data: its row count and that count
# as a percentage of all rows in Womens400mHurdles.
# Both the grouped rows and the denominator subquery read the women's view;
# the previous version grouped the men's view while dividing by the women's
# total, which produced percentages that mixed the two datasets.
QueryFourB = spark.sql(
    "SELECT Schools, COUNT(*) as count, "
    "(COUNT(*) / (SELECT COUNT(*) FROM Womens400mHurdles)) * 100 as percentage "
    "FROM Womens400mHurdles "
    "GROUP BY Schools"
)
QueryFourB.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
## Import Data (pandas, for the Seaborn plots below)

In [None]:
# Load the men's results into a pandas DataFrame for plotting.
# The file name matches the one read by the Spark loader earlier in this
# notebook ('Mens400mHurdles.csv'); the previous 'Men400mHurdles.csv' dropped
# the "s" and pointed at a different file name.
Mens400mHurdlesDF = pd.read_csv('Mens400mHurdles.csv')
# Bare expression: renders the DataFrame as the notebook cell's output.
Mens400mHurdlesDF

In [None]:
# Load the women's results into a pandas DataFrame for plotting.
Womens400mHurdlesDF = pd.read_csv(
    'Womens400mHurdles.csv'
)
# Bare expression: renders the DataFrame as the notebook cell's output.
Womens400mHurdlesDF

## Create a histogram using Seaborn

In [None]:
# Histogram of the men's "Times" column (20 bins requested) with a KDE overlay.
column_name = "Times"
ax = sns.histplot(
    Mens400mHurdlesDF[column_name],
    bins=20,
    kde=True,
    color='skyblue',
    edgecolor='black',
)

# Label the axes and title on the Axes that histplot drew into.
ax.set_xlabel(column_name)
ax.set_ylabel('Frequency')
ax.set_title(f'Distribution of {column_name}')

# Render the figure.
plt.show()

In [None]:
# Histogram of the women's "Times" column (20 bins requested) with a KDE overlay.
column_name = "Times"
ax = sns.histplot(
    Womens400mHurdlesDF[column_name],
    bins=20,
    kde=True,
    color='red',
    edgecolor='black',
)

# Label the axes and title on the Axes that histplot drew into.
ax.set_xlabel(column_name)
ax.set_ylabel('Frequency')
ax.set_title(f'Distribution of {column_name}')

# Render the figure.
plt.show()

In [None]:
# Bar-style histogram of how many men's rows fall under each value of "Schools".
# No KDE overlay here: the tick labels are names (see the rotation below), and
# a density curve fitted over the positions of named categories has no
# meaningful interpretation, so the previous kde=True was dropped.
column_name = "Schools"
sns.histplot(
    Mens400mHurdlesDF[column_name],
    bins=20,
    color='skyblue',
    edgecolor='black',
)

# Add labels and title
plt.xlabel(column_name)
plt.ylabel('Frequency')
plt.title(f'Distribution of {column_name}')

# Rotate the tick labels so the names do not overlap.
plt.xticks(rotation=60)

# Show the plot
plt.show()

In [None]:
# Bar-style histogram of how many women's rows fall under each value of "Schools".
# No KDE overlay here: the tick labels are names (see the rotation below), and
# a density curve fitted over the positions of named categories has no
# meaningful interpretation, so the previous kde=True was dropped.
column_name = "Schools"
sns.histplot(
    Womens400mHurdlesDF[column_name],
    bins=20,
    color='red',
    edgecolor='black',
)

# Add labels and title
plt.xlabel(column_name)
plt.ylabel('Frequency')
plt.title(f'Distribution of {column_name}')

# Rotate the tick labels so the names do not overlap.
plt.xticks(rotation=60)

# Show the plot
plt.show()