In [3]:
import pandas as pd
import warnings

# Suppress DtypeWarnings for a cleaner output
warnings.filterwarnings('ignore', category=pd.errors.DtypeWarning)


# Load the merged sessions file
# Note: You may need to use the chunking method if the file is too large
try:
    merged_sessions_df = pd.read_csv('../outputs/merged_sessions.csv')
    num_rows = merged_sessions_df.shape[0]
    print(f"The number of rows in merged_sessions.csv is: {num_rows}")
except Exception as e:
    print(f"An error occurred: {e}")
    print("The file might be too large. Try reading it in chunks to get the row count.")
    # Fallback to a chunking method for large files
    chunk_size = 10000
    total_rows = 0
    for chunk in pd.read_csv('outputs/merged_sessions.csv', chunksize=chunk_size):
        total_rows += len(chunk)
    print(f"The number of rows (using chunking) is: {total_rows}")

The number of rows in merged_sessions.csv is: 17701957


In [1]:
import pandas as pd
import os

# --- Configuration ---
# Set the path to your large CSV file
large_file_path = '../data/email.csv' 
# Set the path to the directory where you want to save the smaller files
output_dir = '../splunk_inputs/email_chunks' 
# Define the size of each chunk (in rows)
# 500,000 rows should produce a file well under the 500 MB limit
chunk_size = 500000 

# Create the output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# --- Splitting the file ---
chunk_number = 0
for chunk in pd.read_csv(large_file_path, chunksize=chunk_size):
    chunk_number += 1
    # Define the output file name
    output_file_name = f'email_chunk_{chunk_number}.csv'
    output_path = os.path.join(output_dir, output_file_name)
    
    # Save the chunk to a new CSV file
    chunk.to_csv(output_path, index=False)
    
    print(f"Saved chunk {chunk_number} to {output_path}")

print("All chunks have been saved successfully.")

Saved chunk 1 to ../splunk_inputs/email_chunks\email_chunk_1.csv
Saved chunk 2 to ../splunk_inputs/email_chunks\email_chunk_2.csv
Saved chunk 3 to ../splunk_inputs/email_chunks\email_chunk_3.csv
Saved chunk 4 to ../splunk_inputs/email_chunks\email_chunk_4.csv
Saved chunk 5 to ../splunk_inputs/email_chunks\email_chunk_5.csv
Saved chunk 6 to ../splunk_inputs/email_chunks\email_chunk_6.csv
Saved chunk 7 to ../splunk_inputs/email_chunks\email_chunk_7.csv
Saved chunk 8 to ../splunk_inputs/email_chunks\email_chunk_8.csv
Saved chunk 9 to ../splunk_inputs/email_chunks\email_chunk_9.csv
Saved chunk 10 to ../splunk_inputs/email_chunks\email_chunk_10.csv
Saved chunk 11 to ../splunk_inputs/email_chunks\email_chunk_11.csv
Saved chunk 12 to ../splunk_inputs/email_chunks\email_chunk_12.csv
Saved chunk 13 to ../splunk_inputs/email_chunks\email_chunk_13.csv
Saved chunk 14 to ../splunk_inputs/email_chunks\email_chunk_14.csv
Saved chunk 15 to ../splunk_inputs/email_chunks\email_chunk_15.csv
Saved chunk 1

In [2]:
import pandas as pd
import os

# --- Configuration ---
# Set the path to your large CSV file
large_file_path = '../data/file.csv' 
# Set the path to the directory where you want to save the smaller files
output_dir = '../splunk_inputs/file_chunks' 
# Define the size of each chunk (in rows)
# 500,000 rows should produce a file well under the 500 MB limit
chunk_size = 500000 

# Create the output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# --- Splitting the file ---
chunk_number = 0
for chunk in pd.read_csv(large_file_path, chunksize=chunk_size):
    chunk_number += 1
    # Define the output file name
    output_file_name = f'email_chunk_{chunk_number}.csv'
    output_path = os.path.join(output_dir, output_file_name)
    
    # Save the chunk to a new CSV file
    chunk.to_csv(output_path, index=False)
    
    print(f"Saved chunk {chunk_number} to {output_path}")

print("All chunks have been saved successfully.")

Saved chunk 1 to ../splunk_inputs/file_chunks\email_chunk_1.csv
Saved chunk 2 to ../splunk_inputs/file_chunks\email_chunk_2.csv
Saved chunk 3 to ../splunk_inputs/file_chunks\email_chunk_3.csv
Saved chunk 4 to ../splunk_inputs/file_chunks\email_chunk_4.csv
Saved chunk 5 to ../splunk_inputs/file_chunks\email_chunk_5.csv
All chunks have been saved successfully.


In [None]:
import pandas as pd

# Assuming you have the email.csv file loaded as 'email_df'
email = pd.read_csv('../data/email.csv')

# Extract the domain from the 'from' email addresses
email['domain'] = email['from'].apply(lambda x: x.split('@')[1])

# Count the frequency of each domain
domain_counts = email['domain'].value_counts()

print("Top 10 most common domains:")
print(domain_counts.head(10))