# In [None]:
# Task 2: Executing Hive Queries on the UNSW-NB15 Dataset

# Import necessary libraries
from pyhive import hive
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Open the shared HiveServer2 connection that every query below goes through
conn = hive.Connection(
    host='localhost',
    port=10000,
    username='hadoop',
)

# Function to execute a Hive query and return the result as a Pandas DataFrame
def execute_hive_query(query, connection=None):
    """Execute ``query`` and return its rows as a pandas DataFrame.

    Args:
        query: The statement to execute.
        connection: Optional DB-API style connection whose ``cursor()``
            returns a context-managed cursor. When omitted, the
            module-level ``conn`` is used.

    Returns:
        A DataFrame whose columns are named after the first field of each
        entry in ``cursor.description``. If the cursor reports no result
        set (``description`` is ``None``), an empty DataFrame is returned.
    """
    active_conn = conn if connection is None else connection
    with active_conn.cursor() as cursor:
        cursor.execute(query)
        description = cursor.description
        # A statement without a result set has no description; fetching
        # rows or reading column names would fail, so return early.
        if description is None:
            return pd.DataFrame()
        columns = [field[0] for field in description]
        rows = cursor.fetchall()
    return pd.DataFrame(rows, columns=columns)

# Example Query 1: total row count of the dataset
query1 = """
SELECT COUNT(*)
FROM unsw_nb15
"""
result1 = execute_hive_query(query1)

# The query yields a single row with a single column, so read it by position
total_records = result1.iat[0, 0]
print("Total number of records in the dataset:", total_records)

# Example Query 2: Count the number of records for each label
# The aggregate is aliased explicitly. Without an alias the engine picks the
# column name itself (HiveServer2 normally generates names such as `_c1`),
# so the plot below could not reliably look the column up by name.
query2 = """
SELECT label, COUNT(*) AS record_count
FROM unsw_nb15
GROUP BY label
"""
result2 = execute_hive_query(query2)
print(result2)

# Visualize the distribution of labels
plt.figure(figsize=(10, 6))
# y must match the alias given to COUNT(*) in query2
sns.barplot(x='label', y='record_count', data=result2)
plt.title('Distribution of Labels')
plt.xlabel('Label')
plt.ylabel('Count')
plt.show()

# Example Query 3: mean of the sbytes column, grouped by label
query3 = """
SELECT label, AVG(sbytes) AS avg_sbytes
FROM unsw_nb15
GROUP BY label
"""
result3 = execute_hive_query(query3)
print(result3)

# Bar chart of avg_sbytes per label; barplot draws on the figure created here
plt.figure(figsize=(10, 6))
ax3 = sns.barplot(data=result3, x='label', y='avg_sbytes')
ax3.set_title('Average Source Bytes for Each Label')
ax3.set_xlabel('Label')
ax3.set_ylabel('Average Source Bytes')
plt.show()

# Example Query 4: the ten srcip values with the most rows
query4 = """
SELECT srcip, COUNT(*) AS connections
FROM unsw_nb15
GROUP BY srcip
ORDER BY connections DESC
LIMIT 10
"""
result4 = execute_hive_query(query4)
print(result4)

# Horizontal bar chart: connection counts on the x-axis, one bar per srcip
fig4, ax4 = plt.subplots(figsize=(12, 8))
sns.barplot(data=result4, x='connections', y='srcip', ax=ax4)
ax4.set_title('Top 10 Source IPs by Number of Connections')
ax4.set_xlabel('Number of Connections')
ax4.set_ylabel('Source IP')
plt.show()

# Close the module-level Hive connection now that all queries above have run.
# NOTE(review): this line is only reached if every statement above succeeds;
# presumably the connection stays open if a query or plot raises — confirm
# whether a try/finally around the script is wanted.
conn.close()
