# VRV Security’s Python Intern Assignment

In [34]:
import pandas as pd
from collections import Counter

**Path to the file**

In [35]:

# Location of the raw log file that the parsing cell opens.
# This is an absolute path under /kaggle/input; change it when the file lives elsewhere.
log_file_path = "/kaggle/input/sample-log1/sample.log.csv"

**Read the log file and Display the DataFrame**

In [36]:
# Read every raw log line up front; `logs` keeps the unparsed text available.
with open(log_file_path, "r") as file:
    logs = file.readlines()

# Each line is parsed purely by whitespace position. The indices used below
# assume an access-log layout like the following (TODO confirm against the raw file):
#   <ip> - - [<date> <tz>] "<method> <path> <protocol>" <status> <size> [message ...]
MIN_FIELDS = 10  # parts[0] through parts[9] must all exist for a line to be usable

parsed_data = []
malformed_lines = 0
for log in logs:
    parts = log.split()

    # Blank lines carry no information; ignore them silently.
    if not parts:
        continue

    # Truncated or bracket-less lines would raise IndexError in the field
    # extraction below, so count and skip them instead of aborting the parse.
    if len(parts) < MIN_FIELDS or "[" not in log:
        malformed_lines += 1
        continue

    ip_address = parts[0]
    # The timestamp is whatever sits between the first "[" and the next "]".
    timestamp = log.split("[")[1].split("]")[0]
    # Method, path and protocol, with the surrounding double quotes removed.
    request = " ".join(parts[5:8]).replace('"', '')
    # Kept as text: the failed-login cells compare it against the string '401'.
    status_code = parts[8]
    size = parts[9]
    # Anything after the size field is treated as a free-text error message.
    error_message = " ".join(parts[10:]) if len(parts) > MIN_FIELDS else None

    # Add the parsed data as a row
    parsed_data.append([ip_address, timestamp, request, status_code, size, error_message])

if malformed_lines:
    print(f"Skipped {malformed_lines} malformed log line(s).")

columns = ["IP Address", "Timestamp", "Request", "Status Code", "Size", "Error Message"]

df = pd.DataFrame(parsed_data, columns=columns)

# Persist the parsed table so it can be inspected outside the notebook.
output_file = "log_table.csv"
df.to_csv(output_file, index=False)

df.head()

Unnamed: 0,IP Address,Timestamp,Request,Status Code,Size,Error Message
0,192.168.1.1,03/Dec/2024:10:12:34 +0000,GET /home HTTP/1.1,200,512,
1,203.0.113.5,03/Dec/2024:10:12:35 +0000,POST /login HTTP/1.1,401,128,"""Invalid credentials"""
2,10.0.0.2,03/Dec/2024:10:12:36 +0000,GET /about HTTP/1.1,200,256,
3,192.168.1.1,03/Dec/2024:10:12:37 +0000,GET /contact HTTP/1.1,200,312,
4,198.51.100.23,03/Dec/2024:10:12:38 +0000,POST /register HTTP/1.1,200,128,


Your Python script should implement the following functionalities:

1. **Count Requests per IP Address**:
    - Parse the provided log file to extract all IP addresses.
    - Calculate the number of requests made by each IP address.
    - Sort and display the results in descending order of request counts.

In [37]:

# Tally how many log rows each client IP produced, label the two columns,
# and order the table from the busiest IP to the quietest.
ip_request_counts = (
    df["IP Address"]
    .value_counts()
    .rename_axis("IP Address")
    .reset_index(name="Request Count")
    .sort_values(by="Request Count", ascending=False)
)
print(ip_request_counts)

      IP Address  Request Count
0    203.0.113.5              8
1  198.51.100.23              8
2    192.168.1.1              7
3       10.0.0.2              6
4  192.168.1.100              5


2. **Identify the Most Frequently Accessed Endpoint**:
    - Extract the endpoints (e.g., URLs or resource paths) from the log file.
    - Identify the endpoint accessed the highest number of times.
    - Provide the endpoint name and its access count.

In [38]:
def _endpoint_of(request_line):
    """Return the second whitespace-separated token of a request line, or None if there is none."""
    tokens = request_line.split()
    if len(tokens) > 1:
        return tokens[1]
    return None


# Derive the endpoint of every request and count how often each one occurs.
df['Endpoint'] = df['Request'].apply(_endpoint_of)
endpoint_counts = df['Endpoint'].value_counts()

# Pick the endpoint with the largest count together with that count.
most_frequent_endpoint = endpoint_counts.idxmax()
count = endpoint_counts.max()

print(
    f"Most Frequently Accessed Endpoint:\n{most_frequent_endpoint} (Accessed {count} times)"
)


Most Frequently Accessed Endpoint:
/login (Accessed 13 times)


In [39]:
# An IP is flagged only when its failed-login count is strictly greater than this.
threshold = 10

# A request counts as a failed login when it returned status '401' or when its
# error message contains "Invalid credentials"; rows with no message never
# match the text check (na=False).
failed_logins = df[(df['Status Code'] == '401') | (df['Error Message'].str.contains("Invalid credentials", na=False))]
failed_login_counts = failed_logins['IP Address'].value_counts()
suspicious_activity = failed_login_counts[failed_login_counts > threshold]

if not suspicious_activity.empty:
    print("\nSuspicious Activity Detected:")
    suspicious_df = pd.DataFrame({
        'IP Address': suspicious_activity.index,
        'Failed Login Attempts': suspicious_activity.values
    })
    print(suspicious_df)
    # Deliberately no CSV write here: appending headerless rows to
    # 'log_analysis_results.csv' at this point produced a malformed file that
    # grew on every re-run and was overwritten anyway by the cell that
    # creates that file with all three result sections.
else:
    print("\nNo suspicious activity detected.")



No suspicious activity detected.


**Because the threshold is set to 10, while the highest and lowest numbers of failed login attempts in the provided sample logs are only 8 and 5 respectively, no IP addresses are flagged for suspicious activity.**

In [40]:
# Lower threshold: flag any IP with strictly more than this many failed logins.
threshold = 2

# Two independent signals mark a row as a failed login: a '401' status code,
# or an error message mentioning "Invalid credentials" (missing messages never match).
is_unauthorized = df['Status Code'].eq('401')
mentions_invalid_credentials = df['Error Message'].str.contains("Invalid credentials", na=False)

failed_logins = df[is_unauthorized | mentions_invalid_credentials]
failed_login_counts = failed_logins['IP Address'].value_counts()
suspicious_activity = failed_login_counts[failed_login_counts.gt(threshold)]

if suspicious_activity.empty:
    print("No suspicious activity detected.")
else:
    # Turn the per-IP counts into a two-column table for display.
    suspicious_df = (
        suspicious_activity
        .rename_axis('IP Address')
        .reset_index(name='Failed Login Attempts')
    )
    print("Suspicious Activity Detected:")
    print(suspicious_df)


Suspicious Activity Detected:
      IP Address  Failed Login Attempts
0    203.0.113.5                      8
1  192.168.1.100                      5


4. **Output Results**:
    - Display the results in a clear, organized format in the terminal.
    - Save the results to a CSV file named `log_analysis_results.csv` with the following structure:
        - **Requests per IP**: Columns: `IP Address`, `Request Count`
        - **Most Accessed Endpoint**: Columns: `Endpoint`, `Access Count`
        - **Suspicious Activity**: Columns: `IP Address`, `Failed Login Count`

In [41]:
# Save results to CSV file: three sections written one after another, each
# with its own header row.
output_file = "log_analysis_results.csv"

# Write the Requests per IP data. This call creates (or overwrites) the file,
# so re-running the cell never accumulates stale content.
ip_request_counts.to_csv(output_file, index=False, header=True)

# The remaining sections are appended through a single file handle.
with open(output_file, 'a') as f:
    # Write the Most Accessed Endpoint data
    f.write("\nMost Accessed Endpoint\n")
    f.write(f"Endpoint,Access Count\n{most_frequent_endpoint},{count}\n")

    # Write the Suspicious Activity data
    f.write("\nSuspicious Activity\n")
    if not suspicious_activity.empty:
        # Build the table straight from the counts so the header is always
        # "Failed Login Count" (matching the empty-case header below) and the
        # cell does not depend on a display frame created in another cell.
        suspicious_export = (
            suspicious_activity
            .rename_axis("IP Address")
            .reset_index(name="Failed Login Count")
        )
        suspicious_export.to_csv(f, index=False, header=True)
    else:
        # Placeholder row so the section is never empty.
        f.write("IP Address,Failed Login Count\nNone,None\n")