In [16]:
import re
from pathlib import Path
from urllib.parse import urlparse

import pandas as pd
import whois
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [18]:
# Input files for this notebook, given as paths relative to the working directory.
train_path = 'train.csv'
test_path = 'test.csv'
submission_path = 'sample_submission.csv'

# Read the training split first, then the test split.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Preview the first training rows.
print(train_df.head())
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() also emits a trailing "None" line.
print(test_df.info())

   Unnamed: 0  sepal length (cm)  sepal width (cm)  petal length (cm)  \
0           0                5.1               3.5                1.4   
1           1                4.9               3.0                1.4   
2           2                4.7               3.2                1.3   
3           3                4.6               3.1                1.5   
4           4                5.0               3.6                1.4   

   petal width (cm)  
0               0.2  
1               0.2  
2               0.2  
3               0.2  
4               0.2  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1747689 entries, 0 to 1747688
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   ID      object
 1   URL     object
dtypes: object(2)
memory usage: 26.7+ MB
None


In [None]:
def extract_features(df, url_col=None):
    """Add simple lexical URL features to ``df`` and return it.

    The feature columns are written onto ``df`` itself (the frame is modified
    in place) and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one URL per row.
    url_col : str, optional
        Name of the column containing the URLs. When omitted, the first column
        whose name equals ``'url'`` ignoring case is used, so both ``'url'``
        and ``'URL'`` are accepted.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the added columns ``url_length``, ``num_dots``,
        ``num_slashes``, ``num_hyphens``, ``has_https``, ``has_www`` and
        ``num_digits``.

    Raises
    ------
    KeyError
        If no URL column can be found.
    """
    if url_col is None:
        # The test data in this notebook names the column 'URL', so a
        # hard-coded 'url' lookup would fail; match the name case-insensitively.
        matches = [col for col in df.columns if str(col).lower() == 'url']
        if not matches:
            raise KeyError(
                f"no URL column found; available columns: {list(df.columns)}"
            )
        url_col = matches[0]

    # Missing values become empty strings so every feature is well defined
    # (len()/count() on a float NaN would raise otherwise).
    urls = df[url_col].fillna('').astype(str)

    df['url_length'] = urls.str.len()
    # Series.str.count takes a regular expression, hence the escaped dot.
    df['num_dots'] = urls.str.count(r'\.')
    df['num_slashes'] = urls.str.count('/')
    df['num_hyphens'] = urls.str.count('-')
    # Substring checks: the token may appear anywhere in the URL, not only as
    # the scheme or the host prefix.
    df['has_https'] = urls.str.contains('https', regex=False).astype(int)
    df['has_www'] = urls.str.contains('www', regex=False).astype(int)
    # str.isdigit() is kept (rather than a r'\d' regex) so the set of
    # characters counted as digits is unchanged.
    df['num_digits'] = urls.apply(lambda x: sum(c.isdigit() for c in x))

    return df
# Enrich the training split, then the test split, with the lexical URL features.
train_df, test_df = extract_features(train_df), extract_features(test_df)

# Show the first training rows after feature extraction.
print(train_df.head())

In [None]:
def _extract_domain(url):
    """Return the network location (host[:port]) of ``url``, or '' if unavailable."""
    if not isinstance(url, str):
        return ''
    url = url.strip()
    # urlparse only fills ``netloc`` when the authority is introduced by '//';
    # a scheme-less input such as 'example.com/login' would otherwise give ''.
    has_scheme = re.match(r'^[A-Za-z][A-Za-z0-9+.\-]*://', url) is not None
    if not has_scheme and not url.startswith('//'):
        url = '//' + url
    try:
        return urlparse(url).netloc
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. a broken IPv6 literal).
        return ''


def _url_column(df):
    """Return the name of the URL column of ``df``, matching 'url' ignoring case."""
    for column in df.columns:
        if str(column).lower() == 'url':
            return column
    raise KeyError(f"no URL column found; available columns: {list(df.columns)}")


# The column is looked up case-insensitively because the test data names it 'URL'.
train_df["domain"] = train_df[_url_column(train_df)].apply(_extract_domain)
test_df["domain"] = test_df[_url_column(test_df)].apply(_extract_domain)


In [None]:
def get_domain_info(domain):
    """Look up WHOIS registration details for ``domain``.

    Parameters
    ----------
    domain : str
        Domain name to query.

    Returns
    -------
    dict or None
        A dict with the keys ``"creation_date"``, ``"expiration_date"`` and
        ``"registrar"`` taken from the WHOIS response, or ``None`` when the
        domain is missing/blank or the lookup fails. Callers must handle the
        ``None`` case before indexing the result.
    """
    # Nothing to query for a missing or blank domain; skip the network call.
    if not isinstance(domain, str) or not domain.strip():
        return None
    try:
        domain_info = whois.whois(domain)
        return {
            "creation_date": domain_info.creation_date,
            "expiration_date": domain_info.expiration_date,
            "registrar": domain_info.registrar
        }
    except Exception:
        # Best-effort lookup: any failure is reported as None. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate so a long-running batch of lookups can still be stopped.
        return None

# Add WHOIS-derived data: domain age in whole days (-1 when it is unknown).
def _domain_age_days(domain, now):
    """Return the days between the domain's WHOIS creation date and ``now``, or -1."""
    info = get_domain_info(domain)
    # get_domain_info returns None on failure, so it must not be indexed blindly.
    if not info:
        return -1
    created = info.get("creation_date")
    if created is None:
        return -1
    # NOTE(review): the WHOIS client may return a single value or a list of
    # values for creation_date - both shapes are handled; confirm against the
    # installed library version.
    candidates = created if isinstance(created, (list, tuple)) else [created]
    parsed = pd.to_datetime(pd.Series(candidates, dtype=object), utc=True, errors="coerce")
    earliest = parsed.min()  # NaT when nothing could be parsed
    if pd.isna(earliest):
        return -1
    return int((now - earliest).days)


# One reference timestamp so train and test ages are measured against the same instant.
_now = pd.Timestamp.now(tz="UTC")

# Query each distinct domain once instead of once per row; WHOIS lookups are
# network calls and the same domain can appear on many rows.
_unique_domains = pd.concat([train_df["domain"], test_df["domain"]]).dropna().unique()
_age_by_domain = {domain: _domain_age_days(domain, _now) for domain in _unique_domains}

train_df["domain_age"] = train_df["domain"].map(_age_by_domain).fillna(-1).astype(int)
test_df["domain_age"] = test_df["domain"].map(_age_by_domain).fillna(-1).astype(int)


In [None]:
# Select the features to use
candidate_features = ['url_length', 'num_dots', 'num_slashes', 'num_hyphens', 'has_https', 'has_www', 'num_digits', 'domain_age', 'is_blacklisted']

# Only train on columns that exist in BOTH frames: a feature missing from
# either one (e.g. 'is_blacklisted', which no cell in this notebook computes)
# would raise a KeyError here or at prediction time.
missing_features = [
    col for col in candidate_features
    if col not in train_df.columns or col not in test_df.columns
]
if missing_features:
    print("Skipping features not present in both train and test data:", missing_features)
features = [col for col in candidate_features if col not in missing_features]

X = train_df[features]
y = train_df['label']  # ground-truth labels

# Split the data (training / validation)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model; n_jobs=-1 builds the trees on all available cores.
clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
clf.fit(X_train, y_train)

# Predict on the validation data
y_pred = clf.predict(X_val)

# Print the accuracy
print("Validation Accuracy:", accuracy_score(y_val, y_pred))


In [None]:
# Predict on the test data
test_predictions = clf.predict(test_df[features])

# Build the submission from the sample_submission.csv template
submission = pd.read_csv(submission_path)
# Fail with a clear message if the template and the test set disagree on row count.
if len(submission) != len(test_predictions):
    raise ValueError(
        f"submission template has {len(submission)} rows but "
        f"{len(test_predictions)} predictions were produced"
    )
submission['label'] = test_predictions  # fill in the predictions

# Save; create the target directory first so to_csv does not fail when it is absent.
output_path = Path("/mnt/data/submission.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(output_path, index=False)
print("결과 저장 완료!")
