In [2]:
import pandas as pd 
import glob
# Turn off pandas' chained-assignment warning for the whole session.
pd.options.mode.chained_assignment = None  # default='warn'

# glob.glob() returns matches in an arbitrary, filesystem-dependent order.
# Sorting the paths keeps the row order of the loaded data stable, so that
# order-sensitive aggregations such as 'first' give the same answer on every run.
all_day_files = sorted(glob.glob("day_data/*.csv"))

# One DataFrame per daily CSV, each tagged with the path it was read from.
data_frames = []
for file_name in all_day_files:
    # Read favorite_payload as text instead of letting pandas infer a dtype.
    df = pd.read_csv(file_name, dtype={'favorite_payload': str})
    df['day_file'] = file_name
    data_frames.append(df)

def find_mode(series):
    """Return the most frequent value of ``series``.

    Uses ``Series.mode()`` and returns the first entry of its result, so when
    several values are tied only the first one reported by pandas is kept.
    If ``mode()`` yields nothing (for example an empty series), ``pd.NA`` is
    returned instead.
    """
    modes = series.mode()
    return pd.NA if modes.empty else modes.iloc[0]
        
# Stack every per-day frame into a single table with a fresh 0..n-1 index.
combined_df = pd.concat(data_frames, ignore_index=True)

# Columns reduced with the 'first' aggregation (output name == source column).
first_value_columns = ['asn', 'country', 'city']

# Columns averaged over all rows of a source IP (output name == source column).
mean_columns = [
    'scan_length_seconds',
    'median_time_diff',
    'distinct_src_ports',
    'distinct_dest_ports',
    'q1_prev_ip',
    'median_prev_ip',
    'q3_prev_ip',
    'distinct_ips',
    'total_hits',
    'probes_per_ip',
    'distinct_fingerprints',
    'distinct_payloads',
    'avg_payload_length',
    'q1q3_delta',
]

# (output name, source column) pairs reduced to their most frequent value.
mode_columns = [
    ('top_port', 'top_port'),
    ('top_fingerprint', 'top_fingerprint'),
    ('generation_algorithm', 'generation_algorithm'),
    ('top_start_hour', 'start_hour'),
    ('top_end_hour', 'end_hour'),
]

# Named-aggregation spec: output column -> (source column, aggregator).
# Insertion order determines the column order of the aggregated frame.
aggregations = {
    'subnet': ('subnet', 'first'),
    # Number of distinct day files in which the source IP appears.
    'active_days': ('day_file', 'nunique'),
}
aggregations.update({column: (column, 'first') for column in first_value_columns})
aggregations.update({column: (column, 'mean') for column in mean_columns})
aggregations.update({target: (source, find_mode) for target, source in mode_columns})

# One row per src_ip; reset_index() turns the group key back into a column.
per_ip_groups = combined_df.groupby('src_ip')
aggregated_df = per_ip_groups.agg(**aggregations).reset_index()

# Keep only the rows whose averaged median_time_diff is present.
has_time_diff = aggregated_df['median_time_diff'].notna()
clean_df = aggregated_df[has_time_diff]

clean_df.to_csv('3_weeks.csv')
print(clean_df)


              src_ip        subnet  active_days      asn country  \
0       1.681231e+07  1.681229e+07            3  23969.0      TH   
1       1.681427e+07  1.681408e+07            2  23969.0      TH   
2       1.681438e+07  1.681434e+07            1  23969.0      TH   
3       1.681454e+07  1.681434e+07            1  23969.0      TH   
4       1.681460e+07  1.681459e+07            2  23969.0      TH   
...              ...           ...          ...      ...     ...   
921886  3.758089e+09  3.758089e+09            1  45727.0      ID   
921887  3.758090e+09  3.758089e+09            1  45727.0      ID   
921888  3.758090e+09  3.758090e+09            1  45727.0      ID   
921889  3.758090e+09  3.758090e+09            1  45727.0      ID   
921890  3.758093e+09  3.758093e+09            1  45117.0      IN   

                 city  scan_length_seconds  median_time_diff  \
0       Nakhon Pathom               9670.0            3000.0   
1         Khlong Thom              39334.0         1189

In [28]:
# Select the aggregated rows whose active_days value is exactly 6.
active_six_days = aggregated_df['active_days'].eq(6)
gen1 = aggregated_df.loc[active_six_days]
print(gen1)

            src_ip      subnet  active_days    asn country        city  \
244       16962648    16962560            6  23969      TH  Nonthaburi   
372       17470127    17469952            6  23969      TH     Bangkok   
570       18205419    18205184            6   2519      JP     Okazaki   
571       18250766    18250752            6  45528      IN      Kanpur   
936       18432435    18432256            6   4837      CN        None   
...            ...         ...          ...    ...     ...         ...   
921076  3754409980  3754409728            6   4134      CN        None   
921083  3754692472  3754692352            6  45758      TH   Khon Kaen   
921154  3754781820  3754781696            6  45758      TH       Trang   
921239  3754882623  3754882560            6  45758      TH   Khon Kaen   
921508  3756605881  3756605696            6  24560      IN       Delhi   

        scan_length_seconds  median_time_diff  distinct_src_ports  \
244            60602.000000      1.211667e