In [None]:
## Import Packages

In [1]:
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
from pyspark.sql.functions import col

In [4]:
from pyspark.sql.functions import desc

In [5]:
from pyspark.sql.functions import asc

In [6]:
import findspark

In [7]:
# Let findspark locate the local Spark installation for this kernel.
# NOTE(review): findspark's documented usage is to call init() *before*
# `import pyspark`; in this notebook pyspark is imported in earlier cells,
# so confirm whether this call is still needed or should move to the top.
findspark.init()

In [8]:
# Start (or reuse) the Spark session used by every spark.read / spark.sql
# call in the cells below.
spark = (
    SparkSession.builder
    .appName('4x100mRelay')
    .getOrCreate()
)

In [None]:
## Import Data

In [9]:
# Read the men's CSV into a Spark DataFrame: the first row supplies the
# column names and column types are inferred from the data.
Mens4x100m = (
    spark.read
    .format('csv')
    .option('inferSchema', 'true')
    .option('header', 'true')
    .option('path', 'Men4x100m.csv')
    .load()
)

In [10]:
# Read the women's CSV into a Spark DataFrame: the first row supplies the
# column names and column types are inferred from the data.
Womens4x100m = (
    spark.read
    .format('csv')
    .option('inferSchema', 'true')
    .option('header', 'true')
    .option('path', 'Womens4x100m.csv')
    .load()
)

In [11]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [None]:
## Create Views

In [12]:
# Register the men's Spark DataFrame as a temp view so the spark.sql
# queries below can refer to it by the table name "Mens4x100m".
Mens4x100m.createOrReplaceTempView("Mens4x100m")

In [13]:
# Register the women's Spark DataFrame as a temp view so the spark.sql
# queries below can refer to it by the table name "Womens4x100m".
Womens4x100m.createOrReplaceTempView("Womens4x100m")

## Basic Selection

In [None]:
# Preview the men's table by selecting every column from its temp view.
resultone = spark.sql(
    "SELECT * FROM Mens4x100m"
)
resultone.show()

In [None]:
# Preview the women's table by selecting every column from its temp view.
resulttwo = spark.sql(
    "SELECT * FROM Womens4x100m"
)
resulttwo.show()

## Query One (Times)

In [None]:
# Count how often each OfficialTimes value occurs in the men's table,
# most frequent first.
QueryOneA = spark.sql(
    "SELECT OfficialTimes, COUNT(*) as count "
    "FROM Mens4x100m "
    "GROUP BY OfficialTimes "
    "ORDER BY count DESC"
)
QueryOneA.show()

In [None]:
# Count how often each OfficialTimes value occurs in the women's table,
# most frequent first.
QueryOneB = spark.sql(
    "SELECT OfficialTimes, COUNT(*) as count "
    "FROM Womens4x100m "
    "GROUP BY OfficialTimes "
    "ORDER BY count DESC"
)
QueryOneB.show()

## Query Two (Team Names)

In [None]:
# Count how many rows each Names value has in the men's table,
# most frequent first.
QueryTwoA = spark.sql(
    "SELECT Names, COUNT(*) as count "
    "FROM Mens4x100m "
    "GROUP BY Names "
    "ORDER BY count DESC"
)
QueryTwoA.show()

In [None]:
# Count how many rows each Names value has in the women's table,
# most frequent first.
QueryTwoB = spark.sql(
    "SELECT Names, COUNT(*) as count "
    "FROM Womens4x100m "
    "GROUP BY Names "
    "ORDER BY count DESC"
)
QueryTwoB.show()

## Query Three (Percentage of Times)

In [None]:
# Share of the men's rows held by each OfficialTimes value: per-group count
# divided by the table's total row count (scalar subquery), times 100.
# The result is ordered by percentage so that show(), which prints only the
# first 20 rows by default, displays the largest shares instead of an
# arbitrary subset of groups.
QueryThreeA = spark.sql(
    "SELECT OfficialTimes, COUNT(*) as count, "
    "(COUNT(*) / (SELECT COUNT(*) FROM Mens4x100m)) * 100 as percentage "
    "FROM Mens4x100m "
    "GROUP BY OfficialTimes "
    "ORDER BY percentage DESC"
)
QueryThreeA.show()

In [None]:
# Share of the women's rows held by each OfficialTimes value: per-group count
# divided by the table's total row count (scalar subquery), times 100.
# The result is ordered by percentage so that show(), which prints only the
# first 20 rows by default, displays the largest shares instead of an
# arbitrary subset of groups.
QueryThreeB = spark.sql(
    "SELECT OfficialTimes, COUNT(*) as count, "
    "(COUNT(*) / (SELECT COUNT(*) FROM Womens4x100m)) * 100 as percentage "
    "FROM Womens4x100m "
    "GROUP BY OfficialTimes "
    "ORDER BY percentage DESC"
)
QueryThreeB.show()

## Query Four (Percentage of Schools)

In [None]:
# Share of the men's rows held by each Names value: per-group count divided
# by the table's total row count (scalar subquery), times 100.
# The result is ordered by percentage so that show(), which prints only the
# first 20 rows by default, displays the largest shares instead of an
# arbitrary subset of groups.
QueryFourA = spark.sql(
    "SELECT Names, COUNT(*) as count, "
    "(COUNT(*) / (SELECT COUNT(*) FROM Mens4x100m)) * 100 as percentage "
    "FROM Mens4x100m "
    "GROUP BY Names "
    "ORDER BY percentage DESC"
)
QueryFourA.show()

In [None]:
# Share of the women's rows held by each Names value: per-group count divided
# by the table's total row count (scalar subquery), times 100.
# The result is ordered by percentage so that show(), which prints only the
# first 20 rows by default, displays the largest shares instead of an
# arbitrary subset of groups.
QueryFourB = spark.sql(
    "SELECT Names, COUNT(*) as count, "
    "(COUNT(*) / (SELECT COUNT(*) FROM Womens4x100m)) * 100 as percentage "
    "FROM Womens4x100m "
    "GROUP BY Names "
    "ORDER BY percentage DESC"
)
QueryFourB.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Importing CSVs with pandas

In [None]:
# Load the men's CSV again, this time as a pandas DataFrame for plotting.
# NOTE: this rebinds `Mens4x100m`, which earlier cells bound to the Spark
# DataFrame read from the same file. From here on the name refers to the
# pandas frame, so re-running the Spark temp-view cell after this one would
# operate on the wrong kind of object.
Mens4x100m = pd.read_csv('Men4x100m.csv')
# Bare last expression so the notebook renders the frame.
Mens4x100m

In [None]:
# Load the women's CSV again, this time as a pandas DataFrame for plotting.
# NOTE: this rebinds `Womens4x100m`, which earlier cells bound to the Spark
# DataFrame read from the same file. From here on the name refers to the
# pandas frame, so re-running the Spark temp-view cell after this one would
# operate on the wrong kind of object.
Womens4x100m = pd.read_csv('Womens4x100m.csv')
# Bare last expression so the notebook renders the frame.
Womens4x100m

## Create a histogram using Seaborn

In [None]:
# Histogram of the men's OfficialTimes column with a KDE curve overlaid.
column_name = "OfficialTimes"
ax = sns.histplot(
    data=Mens4x100m,
    x=column_name,
    bins=20,
    kde=True,
    color='skyblue',
    edgecolor='black',
)

# Axis labels and title, set together on the Axes that histplot returned.
ax.set(
    xlabel=column_name,
    ylabel='Frequency',
    title=f'Distribution of {column_name}',
)

# Render the figure.
plt.show()

In [None]:
# Histogram of the women's OfficialTimes column with a KDE curve overlaid.
column_name = "OfficialTimes"
ax = sns.histplot(
    data=Womens4x100m,
    x=column_name,
    bins=20,
    kde=True,
    color='red',
    edgecolor='black',
)

# Axis labels and title, set together on the Axes that histplot returned.
ax.set(
    xlabel=column_name,
    ylabel='Frequency',
    title=f'Distribution of {column_name}',
)

# Render the figure.
plt.show()

In [None]:
# Histogram of the men's Names column.
# NOTE(review): Names looks like a label column (ticks are rotated for
# readability below); a KDE curve over labels may not be meaningful.
# Confirm whether kde=True is wanted here.
column_name = "Names"
ax = sns.histplot(
    data=Mens4x100m,
    x=column_name,
    bins=20,
    kde=True,
    color='skyblue',
    edgecolor='black',
)

# Axis labels and title, set together on the Axes that histplot returned.
ax.set(
    xlabel=column_name,
    ylabel='Frequency',
    title=f'Distribution of {column_name}',
)

# Tilt the tick labels so long names do not overlap.
plt.xticks(rotation=60)

# Render the figure.
plt.show()

In [None]:
# Histogram of the women's Names column, drawn in the same red used for the
# women's OfficialTimes chart.
# This cell must read from Womens4x100m: the men's Names chart is already
# produced by the preceding cell, so plotting Mens4x100m here would only
# duplicate it in a different colour and leave the women's data unplotted.
# NOTE(review): Names looks like a label column; a KDE curve over labels may
# not be meaningful. Confirm whether kde=True is wanted here.
column_name = "Names"
sns.histplot(Womens4x100m[column_name], bins=20, kde=True, color='red', edgecolor='black')

# Add labels and title
plt.xlabel(column_name)
plt.ylabel('Frequency')
plt.title(f'Distribution of {column_name}')

# Tilt the tick labels so long names do not overlap.
plt.xticks(rotation=60)

# Show the plot
plt.show()