# 1.Data Preprocessing

In [1]:
import os
import sys
import math
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_nb
import warnings
warnings.filterwarnings("ignore")

In [2]:
def get_content(file_path):
    path = os.path.join('../input/file-format-detection/dataset', file_path)
    encodings = ['utf-8', 'cp949']
    for enc in encodings:
        try:
            f = open(path, 'r', encoding=enc)
            ret = '\n'.join(list(map(lambda s: s.rstrip(), f.readlines())))
            f.close()
            return ret
        except UnicodeDecodeError:
            continue
        except:
            break
        finally:
            if f != None: f.close()
    return None

def add_file_content(df):
    contents = []
    for i, row in tqdm_nb(df.iterrows(), total=len(df), desc='read files'):
        contents.append(get_content(row['file_path']))
    df['file_content'] = contents
    return df

df_full = pd.read_csv('../input/file-format-detection/dataset.csv')
df_ext_count = df_full.groupby('extension').count().sort_values(by='id')
df_lang_count = df_full.groupby('language').count().sort_values(by='id')
df_filesizes = df_full.groupby('language').sum().sort_values(by='file_size')
languages = list(df_lang_count[df_lang_count > 500].dropna().index)
print('Train and predict for only:', languages)

Train and predict for only: ['YAML', 'Elixir', 'GAS', 'GLSL', 'Julia', 'Diff', 'C', 'SQL', 'PHP', 'C++', 'Text', 'Java', 'Markdown', 'Ruby', 'Javascript', 'Kotlin', 'JSON', 'Go', 'C#', 'Rust', 'Dart']


In [3]:
df = add_file_content(df_full[df_full.language.isin(languages)].sample(frac=0.5))
# print('List by failed to read contents:\n', df[df['file_content'].isna()][['file_path', 'language', 'file_size']])
df = df.dropna()
df

read files:   0%|          | 0/40773 [00:00<?, ?it/s]

Unnamed: 0,id,file_path,file_size,line_count,extension,language,file_content
38070,38071,Go/038071.go,1133,61,go,Go,// Copyright 2012 The Go Authors. All rights r...
26568,26569,Dart/026569.dart,751,33,dart,Dart,"// Copyright (c) 2021, the Dart project author..."
63303,63304,Ruby/063304.rb,668,21,rb,Ruby,class AddServiceNameToActiveStorageBlobs < Act...
1178,1179,JSON/001179.json,3779,79,json,JSON,"{\n ""Entries"": [\n {\n ""RequestUri"": ..."
57845,57846,Java/057846.java,4656,136,java,Java,/*\n * Minecraft Forge - Forge Development LLC...
...,...,...,...,...,...,...,...
36375,36376,Elixir/036376.exs,883,35,exs,Elixir,defmodule Logger.MixProject do\n use Mix.Proj...
1186,1187,JSON/001187.json,4578,74,json,JSON,"{\n ""Entries"": [\n {\n ""RequestUri"": ..."
83533,83534,Javascript/083534.frag,359,13,frag,Javascript,"#version 450\n\nlayout(binding = 0, rgba8) uni..."
34779,34780,Dart/034780.dart,21226,774,dart,Dart,"// Copyright (c) 2020, the Dart project author..."


# 2.BERT

In [6]:
import re
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

def process_text(text):
    text = text.lower()
    text = re.sub("https*\S+", "[ URL ]", text)
    # remove extra spaces
    text = re.sub('\n', ' ', text)
    text = re.sub('\s{2,}',' ', text)
    text = re.sub('[^a-zA-Z\s]', '', text)
    return text

X = [item[0:10000] for item in df['file_content'].apply(process_text).values]
category_to_index = {category: idx for idx, category in enumerate(df['language'].unique())}
Y = df['language'].map(category_to_index).values

In [7]:
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW
import torch

train_texts, temp_texts, train_labels, temp_labels = train_test_split(X, Y, test_size=0.3, random_state=42)
valid_texts, test_texts, valid_labels, test_labels = train_test_split(temp_texts, temp_labels, test_size=0.67, random_state=42)


In [8]:
#BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# train, valid ,test
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=128)
valid_encodings = tokenizer(list(valid_texts), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=128)


model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=21)  # 假设有5个类别

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [9]:
# 将编码后的数据转换为PyTorch张量
train_input_ids = torch.tensor(train_encodings['input_ids'])
train_attention_mask = torch.tensor(train_encodings['attention_mask'])
train_labels = torch.tensor(train_labels)

valid_input_ids = torch.tensor(valid_encodings['input_ids'])
valid_attention_mask = torch.tensor(valid_encodings['attention_mask'])
valid_labels = torch.tensor(valid_labels)

test_input_ids = torch.tensor(test_encodings['input_ids'])
test_attention_mask = torch.tensor(test_encodings['attention_mask'])
test_labels = torch.tensor(test_labels)


batch_size = 16
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

train_dataset = TensorDataset(train_input_ids, train_attention_mask, train_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

valid_dataset = TensorDataset(valid_input_ids, valid_attention_mask, valid_labels)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

test_dataset = TensorDataset(test_input_ids, test_attention_mask, test_labels)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [10]:
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [11]:
epochs = 5
best_valid_loss = float('inf')
for epoch in range(epochs):
    model.train()
    total_train_loss = 0.0
    for batch in train_loader:
        batch_input_ids, batch_attention_mask, batch_labels = batch
        
        batch_input_ids = batch_input_ids.to(device)
        batch_attention_mask = batch_attention_mask.to(device)
        batch_labels = batch_labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()
    avg_train_loss = total_train_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{epochs}, 平均训练损失: {avg_train_loss:.4f}')
    
    # 在验证集上评估
    model.eval()
    total_valid_loss = 0.0
    with torch.no_grad():
        for batch in valid_loader:
            batch_input_ids, batch_attention_mask, batch_labels = batch
            
            batch_input_ids = batch_input_ids.to(device)
            batch_attention_mask = batch_attention_mask.to(device)
            batch_labels = batch_labels.to(device)
            
            outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask, labels=batch_labels)
            loss = outputs.loss
            total_valid_loss += loss.item()
    
    avg_valid_loss = total_valid_loss / len(valid_loader)
    print(f'Epoch {epoch + 1}/{epochs}, 平均验证损失: {avg_valid_loss:.4f}')
    
    if avg_valid_loss < best_valid_loss:
        best_valid_loss = avg_valid_loss
        torch.save(model.state_dict(), 'best_model.pt')
        print('save the best')
    model.train()

Epoch 1/5, 平均训练损失: 0.4134
Epoch 1/5, 平均验证损失: 0.1163
save the best
Epoch 2/5, 平均训练损失: 0.0811
Epoch 2/5, 平均验证损失: 0.0798
save the best
Epoch 3/5, 平均训练损失: 0.0484
Epoch 3/5, 平均验证损失: 0.0695
save the best
Epoch 4/5, 平均训练损失: 0.0364
Epoch 4/5, 平均验证损失: 0.0799
Epoch 5/5, 平均训练损失: 0.0294
Epoch 5/5, 平均验证损失: 0.0695


In [16]:
from sklearn import metrics

model.load_state_dict(torch.load('best_model.pt'))
model.eval()

# 在测试集上评估模型
test_preds = []
test_labels_list = []
with torch.no_grad():
    for batch in test_loader:
        batch_input_ids, batch_attention_mask, batch_labels = batch
        
        batch_input_ids = batch_input_ids.to(device)
        batch_attention_mask = batch_attention_mask.to(device)
        batch_labels = batch_labels.to(device)
            
        # 前向传播
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
        logits = outputs.logits
        
        # 预测类别
        _, predicted = torch.max(logits, dim=1)
        
        # 将预测结果和真实标签添加到列表中
        test_preds.extend(predicted.cpu().numpy())
        test_labels_list.extend(batch_labels.cpu().numpy())

        
# 计算精确度、召回率和F1分数

print(metrics.classification_report(test_preds, test_labels_list, target_names=sorted(category_to_index, key=lambda x: category_to_index[x])))

              precision    recall  f1-score   support

          Go       1.00      0.99      0.99       968
        Dart       1.00      0.99      1.00      1561
        Ruby       0.99      1.00      0.99       330
        JSON       1.00      0.96      0.98       564
        Java       0.96      0.99      0.98       162
        Rust       0.99      0.99      0.99      1443
          C#       1.00      1.00      1.00      1016
    Markdown       0.91      0.94      0.92       224
       Julia       0.96      1.00      0.98        81
  Javascript       0.98      0.95      0.97       386
         PHP       0.98      1.00      0.99       141
        YAML       0.69      0.97      0.81        37
      Kotlin       0.99      0.99      0.99       552
           C       0.63      0.93      0.75        59
        Text       0.96      0.92      0.94       211
      Elixir       0.96      0.98      0.97        44
        Diff       1.00      1.00      1.00        77
         GAS       0.98    

# (1) Bi-LSTM

In [17]:
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
tokenizer=Tokenizer(num_words= 10000,lower=True)
num_words = 10000
X = df['file_content']
Y = df['language']
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X) 
X = pad_sequences(X,maxlen=100,padding='post') 
y=pd.get_dummies(Y)
X_train, X_valid_test, y_train, y_valid_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
X_valid, X_test, y_valid, y_test = train_test_split(X_valid_test, y_valid_test, test_size = 0.66, random_state = 42)

In [18]:
import tensorflow
import numpy as np
from sklearn import metrics
from tensorflow.keras.models import Sequential 
from keras.layers import Flatten
from tensorflow.keras.layers import Dense, LSTM, Embedding,Bidirectional
from keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.compat.v1.keras.layers import CuDNNLSTM
from tensorflow.keras.layers import Dropout

y_encoded = np.argmax(y.values, axis=1)
mapping = {}
for label, code in zip(Y, y_encoded):
    mapping[label] = code
sorted_keys = sorted(mapping, key=lambda x: mapping[x])


def traing(model):
    es = EarlyStopping(monitor = 'val_loss', mode = 'min', verbose = 1, patience = 3)
    mc = ModelCheckpoint('./'+str(model)+'-model.h5', monitor = 'val_accuracy', mode = 'max', verbose = 1, save_best_only = True)
    history_embedding = model.fit(X_train, y_train, 
                                    epochs = 5, batch_size = 64, 
                                    validation_data=(X_valid, y_valid),
                                    verbose = 1, callbacks= [es, mc]  )
    return model

def testing(model):
    y_pred =  np.argmax(model.predict(X_test), axis  =  1)
    y_true = np.argmax(y_test.values, axis = 1)
    print(metrics.classification_report(y_pred, y_true, target_names=sorted_keys))

In [19]:
def get_BiLSTM():
    EMBEDDING_DIM = 100
    model = Sequential()
    model.add(Embedding(input_dim = num_words,
     output_dim = EMBEDDING_DIM,
     input_length= X.shape[1]))
    model.add(Bidirectional(CuDNNLSTM(100,return_sequences=True)))
    model.add(Dropout(0.2))
    model.add(Bidirectional(CuDNNLSTM(200,return_sequences=True)))
    model.add(Dropout(0.2))
    model.add(Flatten())
    model.add(Dense(100,activation = 'relu'))
    model.add(Dense(21, activation = 'softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer = 'sgd',metrics = 'accuracy')
    return model

In [20]:
lstm_model = traing(get_BiLSTM())

Epoch 1/5

Epoch 00001: val_accuracy improved from -inf to 0.30815, saving model to ./<keras.engine.sequential.Sequential object at 0x7892397cc650>-model.h5
Epoch 2/5

Epoch 00002: val_accuracy improved from 0.30815 to 0.36565, saving model to ./<keras.engine.sequential.Sequential object at 0x7892397cc650>-model.h5
Epoch 3/5

Epoch 00003: val_accuracy improved from 0.36565 to 0.48930, saving model to ./<keras.engine.sequential.Sequential object at 0x7892397cc650>-model.h5
Epoch 4/5

Epoch 00004: val_accuracy improved from 0.48930 to 0.51888, saving model to ./<keras.engine.sequential.Sequential object at 0x7892397cc650>-model.h5
Epoch 5/5

Epoch 00005: val_accuracy improved from 0.51888 to 0.64686, saving model to ./<keras.engine.sequential.Sequential object at 0x7892397cc650>-model.h5


In [21]:
testing(lstm_model)

              precision    recall  f1-score   support

           C       0.00      0.00      0.00         0
          C#       0.95      0.87      0.91      1095
         C++       0.00      0.00      0.00         0
        Dart       0.86      0.87      0.86      1510
        Diff       0.00      0.00      0.00         0
      Elixir       0.00      0.00      0.00         0
         GAS       0.00      0.00      0.00         0
        GLSL       0.00      0.00      0.00         0
          Go       0.70      0.69      0.69       950
        JSON       0.93      0.64      0.76       765
        Java       0.00      0.00      0.00        14
  Javascript       0.28      0.34      0.30       298
       Julia       0.00      0.00      0.00         0
      Kotlin       0.80      0.33      0.47      1315
    Markdown       0.00      0.00      0.00         1
         PHP       0.00      0.00      0.00         0
        Ruby       0.44      0.39      0.41       372
        Rust       0.91    

# (2) CNN

In [22]:
from tensorflow.keras.layers import Input,Conv1D,MaxPooling1D,Dense,GlobalMaxPooling1D,Embedding
from tensorflow.keras.models import Model
from keras.models import Sequential
from keras.layers import Flatten
from keras.preprocessing import sequence

def get_CNN():
    EMBEDDING_DIM = 100
    model = Sequential()
    model.add(Embedding(input_dim = num_words,
     output_dim = EMBEDDING_DIM,
     input_length= X.shape[1]))
    model.add(Conv1D(128, 3, activation = 'relu'))
    model.add(MaxPooling1D(3))
    model.add(Conv1D(64,3,activation = 'relu'))
    model.add(Flatten())
    model.add(Dense(250,activation = 'relu'))
    model.add(Dense(21, activation = 'softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer = 'sgd',metrics = 'accuracy')
    return model

In [23]:
cnn_model = traing(get_CNN())

Epoch 1/5

Epoch 00001: val_accuracy improved from -inf to 0.26052, saving model to ./<keras.engine.sequential.Sequential object at 0x7890f9a23490>-model.h5
Epoch 2/5

Epoch 00002: val_accuracy improved from 0.26052 to 0.26991, saving model to ./<keras.engine.sequential.Sequential object at 0x7890f9a23490>-model.h5
Epoch 3/5

Epoch 00003: val_accuracy improved from 0.26991 to 0.38297, saving model to ./<keras.engine.sequential.Sequential object at 0x7890f9a23490>-model.h5
Epoch 4/5

Epoch 00004: val_accuracy improved from 0.38297 to 0.49266, saving model to ./<keras.engine.sequential.Sequential object at 0x7890f9a23490>-model.h5
Epoch 5/5

Epoch 00005: val_accuracy improved from 0.49266 to 0.52778, saving model to ./<keras.engine.sequential.Sequential object at 0x7890f9a23490>-model.h5


In [24]:
testing(cnn_model)

              precision    recall  f1-score   support

           C       0.00      0.00      0.00         0
          C#       0.91      0.92      0.91      1001
         C++       0.00      0.00      0.00         0
        Dart       0.89      0.51      0.65      2653
        Diff       0.00      0.00      0.00         0
      Elixir       0.00      0.00      0.00         0
         GAS       0.00      0.00      0.00         0
        GLSL       0.00      0.00      0.00         0
          Go       0.52      0.47      0.49      1044
        JSON       0.43      0.92      0.59       250
        Java       0.00      0.00      0.00         0
  Javascript       0.05      0.59      0.10        34
       Julia       0.00      0.00      0.00         0
      Kotlin       0.03      0.35      0.06        49
    Markdown       0.00      0.00      0.00         0
         PHP       0.00      0.00      0.00         0
        Ruby       0.00      0.00      0.00         0
        Rust       0.96    

# (3) LSTM-CNN

In [25]:
def get_LSTM_CNN():
    EMBEDDING_DIM = 100
    model = Sequential()
    model.add(Embedding(input_dim = num_words,
     output_dim = EMBEDDING_DIM,
     input_length= X.shape[1]))
    model.add(Dropout(0.2))
    model.add(Bidirectional(CuDNNLSTM(100,return_sequences=True)))
    model.add(Dropout(0.2))
    model.add(Bidirectional(CuDNNLSTM(200,return_sequences=True)))
    model.add(Dropout(0.2))
    model.add(Conv1D(128, 3, activation = 'relu'))
    model.add(MaxPooling1D(3))
    model.add(Conv1D(64,3,activation = 'relu'))
    model.add(Flatten())
    model.add(Dense(250,activation = 'relu'))
    model.add(Dense(21, activation = 'softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer = 'sgd',metrics = 'accuracy')
    return model

In [26]:
lstm_cnn_model = traing(get_LSTM_CNN())

Epoch 1/5

Epoch 00001: val_accuracy improved from -inf to 0.24874, saving model to ./<keras.engine.sequential.Sequential object at 0x7890f8f32d10>-model.h5
Epoch 2/5

Epoch 00002: val_accuracy improved from 0.24874 to 0.25716, saving model to ./<keras.engine.sequential.Sequential object at 0x7890f8f32d10>-model.h5
Epoch 3/5

Epoch 00003: val_accuracy improved from 0.25716 to 0.39957, saving model to ./<keras.engine.sequential.Sequential object at 0x7890f8f32d10>-model.h5
Epoch 4/5

Epoch 00004: val_accuracy improved from 0.39957 to 0.41015, saving model to ./<keras.engine.sequential.Sequential object at 0x7890f8f32d10>-model.h5
Epoch 5/5

Epoch 00005: val_accuracy improved from 0.41015 to 0.52273, saving model to ./<keras.engine.sequential.Sequential object at 0x7890f8f32d10>-model.h5


In [27]:
testing(lstm_cnn_model)

              precision    recall  f1-score   support

           C       0.00      0.00      0.00         0
          C#       0.90      0.90      0.90      1005
         C++       0.00      0.00      0.00         0
        Dart       0.89      0.49      0.63      2766
        Diff       0.00      0.00      0.00         0
      Elixir       0.00      0.00      0.00         0
         GAS       0.00      0.00      0.00         0
        GLSL       0.00      0.00      0.00         0
          Go       0.62      0.37      0.47      1581
        JSON       0.46      0.77      0.58       316
        Java       0.00      0.00      0.00         3
  Javascript       0.03      0.17      0.05        65
       Julia       0.00      0.00      0.00         0
      Kotlin       0.00      0.00      0.00         0
    Markdown       0.00      0.00      0.00         0
         PHP       0.00      0.00      0.00         0
        Ruby       0.00      0.00      0.00         0
        Rust       0.87    