In [238]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [239]:
url_df = pd.read_csv("all_urls.csv")

In [240]:
url_df

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement
...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing
651188,www.gamespot.com/xbox360/action/deadspace/,phishing
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing


In [241]:
only_benign = url_df.loc[url_df['type'] == 'benign']

In [242]:
# isolating all the bad urls
only_malicious = url_df.loc[(url_df['type'] == 'phishing') | (url_df['type'] == 'malware') | (url_df['type'] == 'defacement')]

In [243]:
v = TfidfVectorizer(analyzer='char')
x = v.fit_transform(only_malicious['url'])
y = v.fit_transform(only_benign['url'])

In [244]:
def display_scores(vectorizer, tfidf_result_malicious, tfidf_result_benign):
    # http://stackoverflow.com/questions/16078015/
    scores_malicious = set(zip(vectorizer.get_feature_names(),
                 np.asarray(tfidf_result_malicious.sum(axis=0)).ravel()))
    scores_benign = set(zip(vectorizer.get_feature_names(),
                 np.asarray(tfidf_result_benign.sum(axis=0)).ravel()))
    sorted_strictly_malicious = sorted(scores_malicious.difference(scores_benign), key = lambda x: x[1], reverse=True)
    for item in sorted_strictly_malicious:
        print("{0:50} Score: {1}".format(item[0], item[1]))

In [245]:
# displaying top features of malicious urls
display_scores(v, x, y)

                                                  Score: 52138.096944768746
r                                                  Score: 49592.12282723011
<                                                  Score: 43611.67457081842
|                                                  Score: 41787.367735695625
v                                                  Score: 39413.517236784035
n                                                  Score: 36614.993137798556
;                                                  Score: 35685.594248607595
                                                  Score: 35407.99891787387
{                                                  Score: 34415.03676393541
}                                                  Score: 32125.44500635107
p                                                  Score: 31633.79738831132
                                                  Score: 30655.897049646606
                                                  Score: 30322.87397156014
z     

In [246]:
badwords = ['sleep', 'drop', 'uid', 'select', 'waitfor', 'delay', 'system', 'union', 'order by', 'group by']

In [247]:
def count_suspicious_chars(row):
    unicode_count, inequality_count, pipe_count, semicolon_count, curlybrace_count, caret_count = 0, 0, 0, 0, 0, 0
    for char in row['url']:
        if char == '\x83' or char == '\x88' or char == '\x85':
            unicode_count += 1
        elif char == '<' or char == '>':
            inequality_count += 1
        elif char == '|':
            pipe_count += 1
        elif char == ';':
            semicolon_count += 1
        elif char == '{':
            curlybrace_count += 1
        elif char == '^':
            caret_count += 1
            
    return unicode_count, inequality_count, pipe_count, semicolon_count, curlybrace_count, caret_count

In [248]:
all_counts = url_df.apply(lambda row: count_suspicious_chars(row), axis=1)

In [249]:
url_df['unicode'] = [tup[0] for tup in all_counts]
url_df['inequality'] = [tup[1] for tup in all_counts]
url_df['pipe'] = [tup[2] for tup in all_counts]
url_df['semicolon'] = [tup[3] for tup in all_counts]
url_df['curlybrace'] = [tup[4] for tup in all_counts]
url_df['caret'] = [tup[5] for tup in all_counts]

In [250]:
url_df

Unnamed: 0,url,type,unicode,inequality,pipe,semicolon,curlybrace,caret
0,br-icloud.com.br,phishing,0,0,0,0,0,0
1,mp3raid.com/music/krizz_kaliko.html,benign,0,0,0,0,0,0
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,0,0,0,0,0
3,http://www.garage-pirenne.be/index.php?option=...,defacement,0,0,0,0,0,0
4,http://adventure-nicaragua.net/index.php?optio...,defacement,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing,0,0,0,0,0,0
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing,0,0,0,0,0,0
651188,www.gamespot.com/xbox360/action/deadspace/,phishing,0,0,0,0,0,0
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,0,0,0,0,0,0


In [251]:
url_df.to_csv("processed_urls.csv")